diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000000000000000000000000000000000000..939e5341e74dc2371c8b47f0e27b50581bed5f63
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,28 @@
+# How to Contribute
+
+We'd love to accept your patches and contributions to this project. There are
+just a few small guidelines you need to follow.
+
+## Contributor License Agreement
+
+Contributions to this project must be accompanied by a Contributor License
+Agreement. You (or your employer) retain the copyright to your contribution;
+this simply gives us permission to use and redistribute your contributions as
+part of the project. Head over to <https://cla.developers.google.com/> to see
+your current agreements on file or to sign a new one.
+
+You generally only need to submit a CLA once, so if you've already submitted
+one (even if it was for a different project), you probably don't need to do it
+again.
+
+## Code reviews
+
+All submissions, including submissions by project members, require review. We
+use GitHub pull requests for this purpose. Consult
+[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
+information on using pull requests.
+
+## Community Guidelines
+
+This project follows
+[Google's Open Source Community Guidelines](https://opensource.google/conduct/).
+article = "<a href='https://github.com/google-research/deeplab2' target='_blank'>DeepLab2: A TensorFlow Library for Deep Labeling | Github Repo</a>"
+gr.Interface(
+    inference,
+    [gr.inputs.Image(type="pil", label="Input")],
+    gr.outputs.Image(type="plot", label="Output"),
+    title=title,
+    description=description,
+    article=article,
+    examples=[
+        ["city1.jpg"],
+        ["city2.jpg"]
+    ]).launch()
diff --git a/common.py b/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..447ddea710a3f3dcdf49219a4940b8bc0ae7694e
--- /dev/null
+++ b/common.py
@@ -0,0 +1,152 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This file contains common methods and constants used across this framework."""
+
+# Prediction keys used by the model output dictionary.
+PRED_PANOPTIC_KEY = 'panoptic_pred'
+PRED_SEMANTIC_KEY = 'semantic_pred'
+PRED_INSTANCE_KEY = 'instance_pred'
+PRED_INSTANCE_CENTER_KEY = 'instance_center_pred'
+
+
+PRED_SEMANTIC_LOGITS_KEY = 'semantic_logits'
+PRED_SEMANTIC_PROBS_KEY = 'semantic_probs'
+PRED_INSTANCE_SCORES_KEY = 'instance_scores'
+PRED_CENTER_HEATMAP_KEY = 'center_heatmap'
+PRED_OFFSET_MAP_KEY = 'offset_map'
+PRED_FRAME_OFFSET_MAP_KEY = 'frame_offset_map'
+PRED_NEXT_OFFSET_MAP_KEY = 'next_offset_map'
+PRED_NEXT_PANOPTIC_KEY = 'next_panoptic_pred'
+PRED_CONCAT_NEXT_PANOPTIC_KEY = 'concat_next_panoptic_pred'
+
+PRED_PIXEL_SPACE_NORMALIZED_FEATURE_KEY = 'pixel_space_normalized_feature'
+PRED_PIXEL_SPACE_MASK_LOGITS_KEY = 'pixel_space_mask_logits'
+PRED_TRANSFORMER_CLASS_LOGITS_KEY = 'transformer_class_logits'
+
+# Ground-truth keys used by the model.
+GT_PANOPTIC_KEY = 'panoptic_gt'
+GT_SEMANTIC_KEY = 'semantic_gt'
+GT_INSTANCE_CENTER_KEY = 'instance_center_gt'
+GT_INSTANCE_REGRESSION_KEY = 'instance_regression_gt'
+GT_FRAME_OFFSET_KEY = 'frame_offset_gt'
+GT_IS_CROWD = 'is_crowd_gt'
+GT_THING_ID_MASK_KEY = 'thing_id_mask_gt'
+GT_THING_ID_CLASS_KEY = 'thing_id_class_gt'
+GT_NEXT_INSTANCE_REGRESSION_KEY = 'next_instance_regression_gt'
+
+# Raw labels.
+GT_PANOPTIC_RAW = 'panoptic_raw'
+GT_SEMANTIC_RAW = 'semantic_raw'
+GT_IS_CROWD_RAW = 'is_crowd_raw'
+GT_SIZE_RAW = 'size_raw'
+GT_NEXT_PANOPTIC_RAW = 'next_panoptic_raw'
+
+# Loss keys.
+SEMANTIC_LOSS = 'semantic_loss'
+CENTER_LOSS = 'center_loss'
+REGRESSION_LOSS = 'regression_loss'
+MOTION_LOSS = 'motion_loss'
+NEXT_REGRESSION_LOSS = 'next_regression_loss'
+PQ_STYLE_LOSS = 'pq_style_loss'
+# The PQ-style loss consists of a class term and a mask dice term.
+PQ_STYLE_LOSS_CLASS_TERM = 'pq_style_loss_class_term'
+PQ_STYLE_LOSS_MASK_DICE_TERM = 'pq_style_loss_mask_dice_term'
+MASK_ID_CROSS_ENTROPY_LOSS = 'mask_id_cross_entropy_loss'
+INSTANCE_DISCRIMINATION_LOSS = 'instance_discrimination_loss'
+TOTAL_LOSS = 'total_loss'
+
+# Weight keys used by the model.
+SEMANTIC_LOSS_WEIGHT_KEY = 'semantic_loss_weight'
+CENTER_LOSS_WEIGHT_KEY = 'center_loss_weight'
+REGRESSION_LOSS_WEIGHT_KEY = 'regression_loss_weight'
+FRAME_REGRESSION_LOSS_WEIGHT_KEY = 'frame_regression_loss_weight'
+NEXT_REGRESSION_LOSS_WEIGHT_KEY = 'next_regression_loss_weight'
+
+# Misc.
+RESIZED_IMAGE = 'resized_image' +IMAGE = 'image' +IMAGE_NAME = 'image_name' +SEQUENCE_ID = 'sequence_id' +NEXT_IMAGE = 'next_image' + +# TfExample keys. +KEY_ENCODED_IMAGE = 'image/encoded' +KEY_ENCODED_PREV_IMAGE = 'prev_image/encoded' +KEY_ENCODED_NEXT_IMAGE = 'next_image/encoded' +KEY_IMAGE_FILENAME = 'image/filename' +KEY_IMAGE_FORMAT = 'image/format' +KEY_IMAGE_HEIGHT = 'image/height' +KEY_IMAGE_WIDTH = 'image/width' +KEY_IMAGE_CHANNELS = 'image/channels' +KEY_ENCODED_LABEL = 'image/segmentation/class/encoded' +KEY_ENCODED_PREV_LABEL = 'prev_image/segmentation/class/encoded' +KEY_ENCODED_NEXT_LABEL = 'next_image/segmentation/class/encoded' +KEY_LABEL_FORMAT = 'image/segmentation/class/format' +KEY_SEQUENCE_ID = 'video/sequence_id' +KEY_FRAME_ID = 'video/frame_id' +KEY_ENCODED_DEPTH = 'image/depth/encoded' +KEY_DEPTH_FORMAT = 'image/depth/format' + +# Checkpoint Items +# All models +CKPT_SEMANTIC_LAST_LAYER = 'semantic_last_layer' + +# DeepLabV3 +CKPT_DEEPLABV3_ASPP = 'deeplab_v3_aspp' +CKPT_DEEPLABV3_CLASSIFIER_CONV_BN_ACT = 'classifier_conv_bn_act' + +# DeepLabV3+ +CKPT_DEEPLABV3PLUS_ASPP = 'deeplab_v3plus_aspp' +CKPT_DEEPLABV3PLUS_PROJECT_CONV_BN_ACT = 'deeplab_v3plus_project_conv_bn_act' +CKPT_DEEPLABV3PLUS_FUSE = 'deeplab_v3plus_fuse' + +# Panoptic-DeepLab +CKPT_SEMANTIC_DECODER = 'semantic_decoder' +CKPT_SEMANTIC_HEAD_WITHOUT_LAST_LAYER = 'semantic_head_without_last_layer' + +CKPT_INSTANCE_DECODER = 'instance_decoder' +CKPT_INSTANCE_CENTER_HEAD_WITHOUT_LAST_LAYER = ('instance_center_head' + '_without_last_layer') +CKPT_INSTANCE_CENTER_HEAD_LAST_LAYER = 'instance_center_head_last_layer' +CKPT_INSTANCE_REGRESSION_HEAD_WITHOUT_LAST_LAYER = ('instance_regression_head' + '_without_last_layer') +CKPT_INSTANCE_REGRESSION_HEAD_LAST_LAYER = 'instance_regression_head_last_layer' + +# Motion-DeepLab +CKPT_MOTION_REGRESSION_HEAD_WITHOUT_LAST_LAYER = ('motion_regression_head' + '_without_last_layer') +CKPT_MOTION_REGRESSION_HEAD_LAST_LAYER = 'motion_regression_head_last_layer' + +# ViP-DeepLab +CKPT_NEXT_INSTANCE_DECODER = 'next_instance_decoder' +CKPT_NEXT_INSTANCE_REGRESSION_HEAD_WITHOUT_LAST_LAYER = ( + 'next_instance_regression_head_without_last_layer') +CKPT_NEXT_INSTANCE_REGRESSION_HEAD_LAST_LAYER = ( + 'next_instance_regression_head_last_layer') + +# MaX-DeepLab +CKPT_PIXEL_SPACE_HEAD = 'pixel_space_head' +CKPT_TRANSFORMER_MASK_HEAD = 'transformer_mask_head' +CKPT_TRANSFORMER_CLASS_HEAD = 'transformer_class_head' +CKPT_PIXEL_SPACE_FEATURE_BATCH_NORM = 'pixel_space_feature_batch_norm' +CKPT_PIXEL_SPACE_MASK_BATCH_NORM = 'pixel_space_mask_batch_norm' + +# Supported Tasks +TASK_PANOPTIC_SEGMENTATION = 'panoptic_segmentation' +TASK_INSTANCE_SEGMENTATION = 'instance_segmentation' +TASK_VIDEO_PANOPTIC_SEGMENTATION = 'video_panoptic_segmentation' +TASK_DEPTH_AWARE_VIDEO_PANOPTIC_SEGMENTATION = ( + 'depth_aware_video_panoptic_segmentation') diff --git a/common_test.py b/common_test.py new file mode 100644 index 0000000000000000000000000000000000000000..54587e52fc6555ffa20146b55dfb8615c8132877 --- /dev/null +++ b/common_test.py @@ -0,0 +1,74 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
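The `KEY_*` constants above define the tf.Example feature schema that DeepLab2's data pipelines read. A sketch of writing one record under that schema (the byte payloads, filename, and sizes are placeholders, not real Cityscapes data):

```python
import tensorflow as tf

from deeplab2 import common


def _bytes_feature(value):
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def _int64_feature(value):
  return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


example = tf.train.Example(features=tf.train.Features(feature={
    common.KEY_ENCODED_IMAGE: _bytes_feature(b'<png bytes>'),
    common.KEY_ENCODED_LABEL: _bytes_feature(b'<png bytes>'),
    common.KEY_IMAGE_FILENAME: _bytes_feature(b'frankfurt_000000_000294'),
    common.KEY_IMAGE_FORMAT: _bytes_feature(b'png'),
    common.KEY_IMAGE_HEIGHT: _int64_feature(1024),
    common.KEY_IMAGE_WIDTH: _int64_feature(2048),
}))
serialized = example.SerializeToString()
```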
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for common.py.""" +import tensorflow as tf + +from deeplab2 import common + + +class CommonTest(tf.test.TestCase): + + def test_constants_keys(self): + self.assertEqual(common.PRED_PANOPTIC_KEY, 'panoptic_pred') + self.assertEqual(common.PRED_SEMANTIC_KEY, 'semantic_pred') + self.assertEqual(common.PRED_INSTANCE_CENTER_KEY, 'instance_center_pred') + self.assertEqual(common.PRED_INSTANCE_KEY, 'instance_pred') + + self.assertEqual(common.PRED_SEMANTIC_LOGITS_KEY, 'semantic_logits') + self.assertEqual(common.PRED_CENTER_HEATMAP_KEY, 'center_heatmap') + self.assertEqual(common.PRED_OFFSET_MAP_KEY, 'offset_map') + self.assertEqual(common.PRED_FRAME_OFFSET_MAP_KEY, 'frame_offset_map') + + self.assertEqual(common.GT_PANOPTIC_KEY, 'panoptic_gt') + self.assertEqual(common.GT_SEMANTIC_KEY, 'semantic_gt') + self.assertEqual(common.GT_INSTANCE_CENTER_KEY, 'instance_center_gt') + self.assertEqual(common.GT_FRAME_OFFSET_KEY, 'frame_offset_gt') + self.assertEqual(common.GT_INSTANCE_REGRESSION_KEY, + 'instance_regression_gt') + self.assertEqual(common.GT_PANOPTIC_RAW, 'panoptic_raw') + self.assertEqual(common.GT_SEMANTIC_RAW, 'semantic_raw') + self.assertEqual(common.GT_SIZE_RAW, 'size_raw') + + self.assertEqual(common.SEMANTIC_LOSS_WEIGHT_KEY, 'semantic_loss_weight') + self.assertEqual(common.CENTER_LOSS_WEIGHT_KEY, 'center_loss_weight') + self.assertEqual(common.REGRESSION_LOSS_WEIGHT_KEY, + 'regression_loss_weight') + self.assertEqual(common.FRAME_REGRESSION_LOSS_WEIGHT_KEY, + 'frame_regression_loss_weight') + + self.assertEqual(common.RESIZED_IMAGE, 'resized_image') + self.assertEqual(common.IMAGE, 'image') + self.assertEqual(common.IMAGE_NAME, 'image_name') + self.assertEqual(common.SEQUENCE_ID, 'sequence_id') + + self.assertEqual(common.KEY_FRAME_ID, 'video/frame_id') + self.assertEqual(common.KEY_SEQUENCE_ID, 'video/sequence_id') + self.assertEqual(common.KEY_LABEL_FORMAT, 'image/segmentation/class/format') + self.assertEqual(common.KEY_ENCODED_PREV_LABEL, + 'prev_image/segmentation/class/encoded') + self.assertEqual(common.KEY_ENCODED_LABEL, + 'image/segmentation/class/encoded') + self.assertEqual(common.KEY_IMAGE_CHANNELS, 'image/channels') + self.assertEqual(common.KEY_IMAGE_WIDTH, 'image/width') + self.assertEqual(common.KEY_IMAGE_HEIGHT, 'image/height') + self.assertEqual(common.KEY_IMAGE_FORMAT, 'image/format') + self.assertEqual(common.KEY_IMAGE_FILENAME, 'image/filename') + self.assertEqual(common.KEY_ENCODED_PREV_IMAGE, 'prev_image/encoded') + self.assertEqual(common.KEY_ENCODED_IMAGE, 'image/encoded') + + +if __name__ == '__main__': + tf.test.main() diff --git a/compile.sh b/compile.sh new file mode 100644 index 0000000000000000000000000000000000000000..2afdcf2afc04835e81bc57f877a65bc6903d1ba1 --- /dev/null +++ b/compile.sh @@ -0,0 +1,114 @@ +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Quick-start command line to set up deeplab2 (Linux only).
+# Example command to run:
+#   deeplab2/compile.sh [cpu|gpu]
+#
+# This script assumes the following folder structure:
+#
+#   + root
+#     + deeplab2
+#     + models
+#       + orbit
+#     + cocoapi
+#       + PythonAPI
+#
+# The script also assumes that `protoc` can be accessed from the command
+# line.
+
+#!/bin/bash
+
+set -e
+
+# cpu or gpu
+CONFIG="cpu"
+
+function tolower() {
+  echo "${1,,}"
+}
+
+if [[ ! -z "$1" ]]
+then
+  echo "Setting configuration from argument($1)..."
+  CONFIG=$(tolower "$1")
+  if [ "$CONFIG" != "cpu" ] && [ "$CONFIG" != "gpu" ]
+  then
+    echo "Configuration must be either \"cpu\" or \"gpu\", exiting..."
+    exit 1
+  fi
+fi
+
+echo "Running configuration with $CONFIG."
+
+# Protobuf compilation
+# Replace `protoc` with `${PATH_TO_PROTOC}` if the protobuf compiler was
+# downloaded from the web.
+echo "-----------------------------------------------------------------------"
+echo "Compiling protobuf..."
+echo "-----------------------------------------------------------------------"
+protoc deeplab2/*.proto --python_out=.
+
+# Compile custom ops
+# See details in
+# https://www.tensorflow.org/guide/create_op#compile_the_op_using_your_system_compiler_tensorflow_binary_installation
+TF_CFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))') )
+TF_LFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))') )
+OP_NAME='deeplab2/tensorflow_ops/kernels/merge_semantic_and_instance_maps_op'
+
+if [ "$CONFIG" == "cpu" ]
+then
+  # CPU
+  echo "-----------------------------------------------------------------------"
+  echo "Compiling the custom cc op: merge_semantic_and_instance_maps_op (CPU)..."
+  echo "-----------------------------------------------------------------------"
+  g++ -std=c++14 -shared \
+    ${OP_NAME}.cc ${OP_NAME}_kernel.cc -o ${OP_NAME}.so -fPIC ${TF_CFLAGS[@]} ${TF_LFLAGS[@]} -O2
+else
+  # GPU
+  # (https://www.tensorflow.org/guide/create_op#compiling_the_kernel_for_the_gpu_device)
+  echo "-----------------------------------------------------------------------"
+  echo "Compiling the custom cc op: merge_semantic_and_instance_maps_op (GPU)..."
+  echo "-----------------------------------------------------------------------"
+  nvcc -std=c++14 -c -o ${OP_NAME}_kernel.cu.o \
+    ${OP_NAME}_kernel.cu.cc \
+    ${TF_CFLAGS[@]} -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC --expt-relaxed-constexpr
+
+  g++ -std=c++14 -shared -o ${OP_NAME}.so ${OP_NAME}.cc ${OP_NAME}_kernel.cc \
+    ${OP_NAME}_kernel.cu.o ${TF_CFLAGS[@]} -fPIC -lcudart ${TF_LFLAGS[@]}
+fi
+
+# PYTHONPATH
+export PYTHONPATH=$PYTHONPATH:`pwd`:`pwd`/models:`pwd`/cocoapi/PythonAPI
+
+# Run tests
+echo "-----------------------------------------------------------------------"
+echo "Running tests for merge_semantic_and_instance_maps_op..."
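Once the g++/nvcc invocations above succeed, the kernel can be loaded from Python with `tf.load_op_library`. A sketch under the file layout assumed by this script; the exact Python wrapper and op signature live in `deeplab2/tensorflow_ops/python`, so treat the attribute name below as an assumption:

```python
import tensorflow as tf

# Shared library produced by the compile step above.
_SO_PATH = ('deeplab2/tensorflow_ops/kernels/'
            'merge_semantic_and_instance_maps_op.so')

gen_ops = tf.load_op_library(_SO_PATH)

# The op is exposed as the snake_case form of its C++ name; assumed here to
# be merge_semantic_and_instance_maps. Consult the generated module (or the
# kernel test run below) for the authoritative signature.
merge_fn = gen_ops.merge_semantic_and_instance_maps
```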
+echo "-----------------------------------------------------------------------" +python deeplab2/tensorflow_ops/python/kernel_tests/merge_semantic_and_instance_maps_op_test.py + +# End-to-end tests +echo "-----------------------------------------------------------------------" +echo "Running end-to-end tests..." +echo "-----------------------------------------------------------------------" + +# Model training test (test for custom ops, protobug) +python deeplab2/model/deeplab_test.py + +# Model evaluation test (test for other packages such as orbit, cocoapi, etc) +python deeplab2/trainer/evaluator_test.py + +echo "------------------------" +echo "Done with configuration!" +echo "------------------------" + diff --git a/config.proto b/config.proto new file mode 100644 index 0000000000000000000000000000000000000000..f126375293957817ec9b848327614a7611276969 --- /dev/null +++ b/config.proto @@ -0,0 +1,40 @@ +// Copyright 2021 The Deeplab2 Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package deeplab2; + +import public 'deeplab2/dataset.proto'; +import public 'deeplab2/evaluator.proto'; +import public 'deeplab2/model.proto'; +import public 'deeplab2/trainer.proto'; + +option java_multiple_files = true; + +// Configure experiment options. +message ExperimentOptions { + // Set the experiment name. + optional string experiment_name = 1; + // Set the options for the model. + optional ModelOptions model_options = 2; + // Set the options for the trainer. + optional TrainerOptions trainer_options = 3; + // Set the options for the training dataset. + optional DatasetOptions train_dataset_options = 4; + // Set the options for the evaluator. + optional EvaluatorOptions evaluator_options = 5; + // Set the options for the validation dataset. + optional DatasetOptions eval_dataset_options = 6; +} diff --git a/configs/cityscapes/axial_deeplab/axial_swidernet_1_1_1_os16.textproto b/configs/cityscapes/axial_deeplab/axial_swidernet_1_1_1_os16.textproto new file mode 100644 index 0000000000000000000000000000000000000000..de31258e40781fc848a5bee1f386091f841c1a87 --- /dev/null +++ b/configs/cityscapes/axial_deeplab/axial_swidernet_1_1_1_os16.textproto @@ -0,0 +1,162 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with Axial-SWideRNet-(1, 1, 1) and output stride 16. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. 
e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# Axial-SWideRNet-(1, 1, 1) applies the axial attention blocks (instead of +# convolutional blocks) to the last two stages of SWideRNet-(1, 1, 1). +# +# For axial attention, see +# - Huiyu Wang, et al. "Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic +# Segmentation." In ECCV, 2020. +# For SWideRNet, see +# - Liang-Chieh Chen, et al. "Scaling Wide Residual Networks for Panoptic +# Segmentation." arXiv: 2011.11675. +# For Panoptic-DeepLab, see +# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline +# for Bottom-Up Panoptic Segmentation." In CVPR, 2020. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). + initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "axial_swidernet" + output_stride: 16 + stem_width_multiplier: 1 + backbone_width_multiplier: 1 + backbone_layer_multiplier: 1 + drop_path_keep_prob: 0.8 + drop_path_schedule: "linear" + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + aspp_use_only_1x1_proj_conv: true + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + aspp_use_only_1x1_proj_conv: true + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 19 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.0001 + training_number_of_steps: 60000 + } +} +train_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 32 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 2.0 + scale_factor_step_size: 0.1 + autoaugment_policy_name: "simple_classification_policy_magnitude_scale_0.2" + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + # Add options to make the evaluation loss comparable to the training loss. 
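Each of these textprotos parses into the `ExperimentOptions` message declared in config.proto above. A sketch of loading one after `compile.sh` has generated the `config_pb2` module and the `${...}` placeholders have been filled in (paths are illustrative):

```python
from google.protobuf import text_format

from deeplab2 import config_pb2

_CONFIG = 'configs/cityscapes/axial_deeplab/axial_swidernet_1_1_1_os16.textproto'

with open(_CONFIG) as f:
  options = text_format.Parse(f.read(), config_pb2.ExperimentOptions())

# Fields mirror the blocks in the textproto.
print(options.model_options.backbone.name)        # axial_swidernet
print(options.trainer_options.solver_options.base_learning_rate)
print(list(options.train_dataset_options.crop_size))  # [1025, 2049]
```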
+ increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 43200 + stuff_area_limit: 2048 + center_score_threshold: 0.1 + nms_kernel: 13 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} diff --git a/configs/cityscapes/axial_deeplab/axial_swidernet_1_1_3_os16.textproto b/configs/cityscapes/axial_deeplab/axial_swidernet_1_1_3_os16.textproto new file mode 100644 index 0000000000000000000000000000000000000000..51c35608431a9a01adaa15851e052711b84497ad --- /dev/null +++ b/configs/cityscapes/axial_deeplab/axial_swidernet_1_1_3_os16.textproto @@ -0,0 +1,162 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with Axial-SWideRNet-(1, 1, 3) and output stride 16. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# Axial-SWideRNet-(1, 1, 3) applies the axial attention blocks (instead of +# convolutional blocks) to the last two stages of SWideRNet-(1, 1, 3). +# +# For axial attention, see +# - Huiyu Wang, et al. "Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic +# Segmentation." In ECCV, 2020. +# For SWideRNet, see +# - Liang-Chieh Chen, et al. "Scaling Wide Residual Networks for Panoptic +# Segmentation." arXiv: 2011.11675. +# For Panoptic-DeepLab, see +# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline +# for Bottom-Up Panoptic Segmentation." In CVPR, 2020. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). 
+ initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "axial_swidernet" + output_stride: 16 + stem_width_multiplier: 1 + backbone_width_multiplier: 1 + backbone_layer_multiplier: 3 + drop_path_keep_prob: 0.8 + drop_path_schedule: "linear" + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + aspp_use_only_1x1_proj_conv: true + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + aspp_use_only_1x1_proj_conv: true + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 19 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.0001 + training_number_of_steps: 60000 + } +} +train_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 32 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 2.0 + scale_factor_step_size: 0.1 + autoaugment_policy_name: "simple_classification_policy_magnitude_scale_0.2" + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 43200 + stuff_area_limit: 2048 + center_score_threshold: 0.1 + nms_kernel: 13 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} diff --git a/configs/cityscapes/axial_deeplab/axial_swidernet_1_1_4.5_os16.textproto b/configs/cityscapes/axial_deeplab/axial_swidernet_1_1_4.5_os16.textproto new file mode 100644 index 0000000000000000000000000000000000000000..cb035f2e01ca28995affbf1b9ae57b888e7fe4e9 --- /dev/null +++ b/configs/cityscapes/axial_deeplab/axial_swidernet_1_1_4.5_os16.textproto @@ -0,0 +1,162 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with Axial-SWideRNet-(1, 1, 4.5) and output stride 16. 
+# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# Axial-SWideRNet-(1, 1, 4.5) applies the axial attention blocks (instead of +# convolutional blocks) to the last two stages of SWideRNet-(1, 1, 4.5). +# +# For axial attention, see +# - Huiyu Wang, et al. "Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic +# Segmentation." In ECCV, 2020. +# For SWideRNet, see +# - Liang-Chieh Chen, et al. "Scaling Wide Residual Networks for Panoptic +# Segmentation." arXiv: 2011.11675. +# For Panoptic-DeepLab, see +# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline +# for Bottom-Up Panoptic Segmentation." In CVPR, 2020. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). + initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "axial_swidernet" + output_stride: 16 + stem_width_multiplier: 1 + backbone_width_multiplier: 1 + backbone_layer_multiplier: 4.5 + drop_path_keep_prob: 0.8 + drop_path_schedule: "linear" + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + aspp_use_only_1x1_proj_conv: true + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + aspp_use_only_1x1_proj_conv: true + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 19 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.000075 + training_number_of_steps: 60000 + } +} +train_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 32 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. 
+ min_resize_value: 0 + max_resize_value: 0 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 2.0 + scale_factor_step_size: 0.1 + autoaugment_policy_name: "simple_classification_policy_magnitude_scale_0.2" + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 43200 + stuff_area_limit: 2048 + center_score_threshold: 0.1 + nms_kernel: 13 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} diff --git a/configs/cityscapes/axial_deeplab/max_deeplab_l_backbone_os16.textproto b/configs/cityscapes/axial_deeplab/max_deeplab_l_backbone_os16.textproto new file mode 100644 index 0000000000000000000000000000000000000000..42ce9f5074f2568b78eac5ee98c8f1a9abebaa55 --- /dev/null +++ b/configs/cityscapes/axial_deeplab/max_deeplab_l_backbone_os16.textproto @@ -0,0 +1,156 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with MaX-DeepLab-L backbone and output stride 16. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# This script employs the MaX-DeepLab-L backbone (i.e., without the memory +# path in the dual-path transformer blocks) as the network backbone. +# +# For MaX-DeepLab-L, see +# - Huiyu Wang, et al. "MaX-DeepLab: End-to-End Panoptic Segmentation with +# Mask Transformers." In CVPR, 2021. +# For Panoptic-DeepLab, see +# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline +# for Bottom-Up Panoptic Segmentation." In CVPR, 2020. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). 
+ initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "max_deeplab_l_backbone" + output_stride: 16 + drop_path_keep_prob: 0.8 + drop_path_schedule: "linear" + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + aspp_use_only_1x1_proj_conv: true + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + aspp_use_only_1x1_proj_conv: true + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 19 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.000075 + training_number_of_steps: 60000 + } +} +train_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 32 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 2.0 + scale_factor_step_size: 0.1 + autoaugment_policy_name: "simple_classification_policy_magnitude_scale_0.2" + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 43200 + stuff_area_limit: 2048 + center_score_threshold: 0.1 + nms_kernel: 13 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} diff --git a/configs/cityscapes/axial_deeplab/max_deeplab_s_backbone_os16.textproto b/configs/cityscapes/axial_deeplab/max_deeplab_s_backbone_os16.textproto new file mode 100644 index 0000000000000000000000000000000000000000..f4f75f21ac5c348d26d9a7a85c78c253c43656a2 --- /dev/null +++ b/configs/cityscapes/axial_deeplab/max_deeplab_s_backbone_os16.textproto @@ -0,0 +1,156 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with MaX-DeepLab-S backbone and output stride 16. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. 
+# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# This script employs the MaX-DeepLab-S backbone (i.e., without the memory +# path in the dual-path transformer blocks) as the network backbone. +# +# For MaX-DeepLab-S, see +# - Huiyu Wang, et al. "MaX-DeepLab: End-to-End Panoptic Segmentation with +# Mask Transformers." In CVPR, 2021. +# For Panoptic-DeepLab, see +# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline +# for Bottom-Up Panoptic Segmentation." In CVPR, 2020. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). + initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "max_deeplab_s_backbone" + output_stride: 16 + drop_path_keep_prob: 0.8 + drop_path_schedule: "linear" + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + aspp_use_only_1x1_proj_conv: true + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + aspp_use_only_1x1_proj_conv: true + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 19 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.0001 + training_number_of_steps: 60000 + } +} +train_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 32 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 2.0 + scale_factor_step_size: 0.1 + autoaugment_policy_name: "simple_classification_policy_magnitude_scale_0.2" + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + # Add options to make the evaluation loss comparable to the training loss. 
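The `center_score_threshold: 0.1` and `nms_kernel: 13` fields repeated in every evaluator_options block drive Panoptic-DeepLab's keypoint-style center selection. A minimal sketch of the idea, not the library's actual implementation (the function and argument names are made up for illustration):

```python
import tensorflow as tf


def select_instance_centers(center_heatmap, nms_kernel=13, threshold=0.1,
                            top_k=200):
  """Keeps local maxima of the predicted center heatmap.

  Args:
    center_heatmap: A [height, width] float32 tensor from the center head.
    nms_kernel: Window size; a pixel survives only if it is the maximum
      within its nms_kernel x nms_kernel neighborhood.
    threshold: Minimum center score, as in center_score_threshold above.
    top_k: Maximum number of instance centers to keep (an assumed default).
  """
  heatmap = center_heatmap[tf.newaxis, :, :, tf.newaxis]
  # Max-pool NMS: compare each pixel against the max of its window.
  pooled = tf.nn.max_pool2d(heatmap, ksize=nms_kernel, strides=1,
                            padding='SAME')
  scores = tf.where(tf.equal(heatmap, pooled), heatmap,
                    tf.zeros_like(heatmap))
  flat_scores = tf.reshape(scores, [-1])
  top_scores, top_indices = tf.math.top_k(flat_scores, k=top_k)
  keep = top_scores > threshold
  return tf.boolean_mask(top_indices, keep), tf.boolean_mask(top_scores, keep)
```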
+ increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 43200 + stuff_area_limit: 2048 + center_score_threshold: 0.1 + nms_kernel: 13 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} diff --git a/configs/cityscapes/panoptic_deeplab/mobilenet_v3_large_os32.textproto b/configs/cityscapes/panoptic_deeplab/mobilenet_v3_large_os32.textproto new file mode 100644 index 0000000000000000000000000000000000000000..c9bc507dce44a8399f78823410804b5134cfdf59 --- /dev/null +++ b/configs/cityscapes/panoptic_deeplab/mobilenet_v3_large_os32.textproto @@ -0,0 +1,156 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with MobilenetV3-Large model and output stride 32. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# References: +# +# For Mobilenet V3, see +# - Andrew Howard, et al. "Searching for MobileNetV3" In ICCV, 2019. +# +# For Panoptic-DeepLab, see +# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline +# for Bottom-Up Panoptic Segmentation." In CVPR, 2020. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). 
+ initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "mobilenet_v3_large" + output_stride: 32 + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 3 + atrous_rates: 6 + atrous_rates: 9 + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + atrous_rates: 3 + atrous_rates: 6 + atrous_rates: 9 + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 19 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.0004 + training_number_of_steps: 30000 + } +} +train_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 64 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 2.0 + scale_factor_step_size: 0.1 + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 43200 + stuff_area_limit: 2048 + center_score_threshold: 0.1 + nms_kernel: 13 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} diff --git a/configs/cityscapes/panoptic_deeplab/mobilenet_v3_small_os32.textproto b/configs/cityscapes/panoptic_deeplab/mobilenet_v3_small_os32.textproto new file mode 100644 index 0000000000000000000000000000000000000000..c6c797ee94e4a4b4b2f7aa642c4d0cf87fdf810c --- /dev/null +++ b/configs/cityscapes/panoptic_deeplab/mobilenet_v3_small_os32.textproto @@ -0,0 +1,156 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with MobilenetV3-Small model and output stride 32. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. 
+# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# References: +# +# For Mobilenet V3, see +# - Andrew Howard, et al. "Searching for MobileNetV3" In ICCV, 2019. +# +# For Panoptic-DeepLab, see +# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline +# for Bottom-Up Panoptic Segmentation." In CVPR, 2020. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). + initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "mobilenet_v3_small" + output_stride: 32 + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 3 + atrous_rates: 6 + atrous_rates: 9 + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + atrous_rates: 3 + atrous_rates: 6 + atrous_rates: 9 + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 19 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.0004 + training_number_of_steps: 30000 + } +} +train_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 64 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 2.0 + scale_factor_step_size: 0.1 + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 43200 + stuff_area_limit: 2048 + center_score_threshold: 0.1 + nms_kernel: 13 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. 
For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} diff --git a/configs/cityscapes/panoptic_deeplab/resnet50_beta_os32.textproto b/configs/cityscapes/panoptic_deeplab/resnet50_beta_os32.textproto new file mode 100644 index 0000000000000000000000000000000000000000..431a71e21702edcc953788e65ae3af25e1acd63b --- /dev/null +++ b/configs/cityscapes/panoptic_deeplab/resnet50_beta_os32.textproto @@ -0,0 +1,158 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with ResNet-50-beta model variant and output stride 32. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# The `resnet50_beta` model variant replaces the first 7x7 convolutions in the +# original `resnet50` with three 3x3 convolutions, which is useful for dense +# prediction tasks. +# +# References: +# For resnet-50-beta, see +# https://github.com/tensorflow/models/blob/master/research/deeplab/core/resnet_v1_beta.py +# For Panoptic-DeepLab, see +# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline +# for Bottom-Up Panoptic Segmentation." In CVPR, 2020. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). 
+ initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "resnet50_beta" + output_stride: 32 + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 3 + atrous_rates: 6 + atrous_rates: 9 + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + atrous_rates: 3 + atrous_rates: 6 + atrous_rates: 9 + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 19 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.00025 + training_number_of_steps: 60000 + } +} +train_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 32 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 2.0 + scale_factor_step_size: 0.1 + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 43200 + stuff_area_limit: 2048 + center_score_threshold: 0.1 + nms_kernel: 13 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} diff --git a/configs/cityscapes/panoptic_deeplab/resnet50_os32_merge_with_pure_tf_func.textproto b/configs/cityscapes/panoptic_deeplab/resnet50_os32_merge_with_pure_tf_func.textproto new file mode 100644 index 0000000000000000000000000000000000000000..49a0f495856554aa623cf9a6711ef50296677355 --- /dev/null +++ b/configs/cityscapes/panoptic_deeplab/resnet50_os32_merge_with_pure_tf_func.textproto @@ -0,0 +1,161 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with ResNet-50 and output stride 32. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. 
+# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# This config provides an example to launch GPU training with +# `merge_semantic_and_instance_with_tf_op` = false, which will NOT invoke +# our efficient merging operation. For faster inference speed, please +# compile the provided `tensorflow_ops` and then set +# `merge_semantic_and_instance_with_tf_op` to true. +# +# References: +# For ResNet, see +# - Kaiming He, et al. "Deep Residual Learning for Image Recognition." +# In CVPR, 2016. +# For Panoptic-DeepLab, see +# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline +# for Bottom-Up Panoptic Segmentation." In CVPR, 2020. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). + initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "resnet50" + output_stride: 32 + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 3 + atrous_rates: 6 + atrous_rates: 9 + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + atrous_rates: 3 + atrous_rates: 6 + atrous_rates: 9 + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 19 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.00025 + training_number_of_steps: 60000 + } +} +train_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 8 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 2.0 + scale_factor_step_size: 0.1 + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + # Add options to make the evaluation loss comparable to the training loss. 
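For reviewers unfamiliar with the merge step these configs keep referencing: whether performed by the custom op or by pure TF functions, the merged panoptic map encodes each pixel as semantic_label * label_divisor + instance_id. A simplified sketch of just that encoding; the real merge additionally handles thing-vs-stuff classes, stuff_area_limit, and void pixels:

```python
import tensorflow as tf


def encode_panoptic(semantic, instance, label_divisor=1000):
  """Simplified panoptic id encoding used across DeepLab2.

  Args:
    semantic: [height, width] int32 semantic prediction.
    instance: [height, width] int32 instance ids (0 for stuff pixels).
    label_divisor: Dataset-dependent constant; 1000 here is an assumption
      matching common panoptic setups, not a value read from these configs.
  """
  return semantic * label_divisor + instance
```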
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+}
+evaluator_options {
+ continuous_eval_timeout: 43200
+ stuff_area_limit: 2048
+ center_score_threshold: 0.1
+ nms_kernel: 13
+ save_predictions: true
+ save_raw_predictions: false
+ # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and
+ # instance maps. For faster speed, compile TensorFlow with provided kernel
+ # implementation under the folder `tensorflow_ops`, and set
+ # merge_semantic_and_instance_with_tf_op to true.
+ merge_semantic_and_instance_with_tf_op: false
+}
diff --git a/configs/cityscapes/panoptic_deeplab/swidernet_sac_1_1_1_os16.textproto b/configs/cityscapes/panoptic_deeplab/swidernet_sac_1_1_1_os16.textproto
new file mode 100644
index 0000000000000000000000000000000000000000..944b6650a8128b90eb3382ca2147aca62bc2429c
--- /dev/null
+++ b/configs/cityscapes/panoptic_deeplab/swidernet_sac_1_1_1_os16.textproto
@@ -0,0 +1,166 @@
+# proto-file: deeplab2/config.proto
+# proto-message: ExperimentOptions
+#
+# Panoptic-DeepLab with SWideRNet-SAC-(1, 1, 1) and output stride 16.
+#
+############### PLEASE READ THIS BEFORE USING THIS CONFIG ###############
+# Before using this config, you need to update the following fields:
+# - experiment_name: Use a unique experiment name for each experiment.
+# - initial_checkpoint: Update the path to the initial checkpoint.
+# - train_dataset_options.file_pattern: Update the path to the
+# training set. e.g., your_dataset/train*.tfrecord
+# - eval_dataset_options.file_pattern: Update the path to the
+# validation set, e.g., your_dataset/eval*.tfrecord
+# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you
+# could successfully compile the provided efficient merging operation
+# under the folder `tensorflow_ops`.
+#########################################################################
+#
+# SWideRNet-SAC-(1, 1, 1) employs the Switchable Atrous Convolution (SAC)
+# in the last stage of the network backbone.
+#
+# References:
+# For SAC, see
+# - Siyuan Qiao, et al. "DetectoRS: Detecting Objects with Recursive
+# Feature Pyramid and Switchable Atrous Convolution." In CVPR, 2021.
+# For SWideRNet, see
+# - Liang-Chieh Chen, et al. "Scaling Wide Residual Networks for
+# Panoptic Segmentation." arXiv: 2011.11675.
+# For Panoptic-DeepLab, see
+# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast
+# Baseline for Bottom-Up Panoptic Segmentation." In CVPR, 2020.
+
+# Use a unique experiment_name for each experiment.
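+# Note: the "(1, 1, 1)" suffix corresponds to the three backbone scaling
+# factors configured below, in order: stem_width_multiplier,
+# backbone_width_multiplier, and backbone_layer_multiplier.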
+experiment_name: "${EXPERIMENT_NAME}" +model_options { + initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "swidernet" + output_stride: 16 + stem_width_multiplier: 1 + backbone_width_multiplier: 1 + backbone_layer_multiplier: 1 + use_sac_beyond_stride: 32 + drop_path_keep_prob: 0.8 + drop_path_schedule: "linear" + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 19 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.0001 + training_number_of_steps: 60000 + } +} +train_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 32 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 2.0 + scale_factor_step_size: 0.1 + autoaugment_policy_name: "simple_classification_policy_magnitude_scale_0.2" + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 43200 + stuff_area_limit: 2048 + center_score_threshold: 0.1 + nms_kernel: 13 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} diff --git a/configs/cityscapes/panoptic_deeplab/swidernet_sac_1_1_3_os16.textproto b/configs/cityscapes/panoptic_deeplab/swidernet_sac_1_1_3_os16.textproto new file mode 100644 index 0000000000000000000000000000000000000000..8eec2ad45fb69717c1c216e5f07f86a236c7493d --- /dev/null +++ b/configs/cityscapes/panoptic_deeplab/swidernet_sac_1_1_3_os16.textproto @@ -0,0 +1,167 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with SWideRNet-SAC-(1, 1, 3) and output stride 16. 
+#
+############### PLEASE READ THIS BEFORE USING THIS CONFIG ###############
+# Before using this config, you need to update the following fields:
+# - experiment_name: Use a unique experiment name for each experiment.
+# - initial_checkpoint: Update the path to the initial checkpoint.
+# - train_dataset_options.file_pattern: Update the path to the
+# training set. e.g., your_dataset/train*.tfrecord
+# - eval_dataset_options.file_pattern: Update the path to the
+# validation set, e.g., your_dataset/eval*.tfrecord
+# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you
+# could successfully compile the provided efficient merging operation
+# under the folder `tensorflow_ops`.
+#########################################################################
+#
+# SWideRNet-SAC-(1, 1, 3) employs the Switchable Atrous Convolution (SAC)
+# in the last stage of the network backbone.
+#
+# References:
+# For SAC, see
+# - Siyuan Qiao, et al. "DetectoRS: Detecting Objects with Recursive
+# Feature Pyramid and Switchable Atrous Convolution." In CVPR, 2021.
+# For SWideRNet, see
+# - Liang-Chieh Chen, et al. "Scaling Wide Residual Networks for
+# Panoptic Segmentation." arXiv: 2011.11675.
+# For Panoptic-DeepLab, see
+# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast
+# Baseline for Bottom-Up Panoptic Segmentation." In CVPR, 2020.
+
+# Use a unique experiment_name for each experiment.
+experiment_name: "${EXPERIMENT_NAME}"
+model_options {
+ initial_checkpoint: "${INIT_CHECKPOINT}"
+ backbone {
+ name: "swidernet"
+ output_stride: 16
+ stem_width_multiplier: 1
+ backbone_width_multiplier: 1
+ backbone_layer_multiplier: 3
+ use_sac_beyond_stride: 32
+ drop_path_keep_prob: 0.8
+ drop_path_schedule: "linear"
+ }
+ decoder {
+ feature_key: "res5"
+ decoder_channels: 256
+ aspp_channels: 256
+ atrous_rates: 6
+ atrous_rates: 12
+ atrous_rates: 18
+ }
+ panoptic_deeplab {
+ low_level {
+ feature_key: "res3"
+ channels_project: 64
+ }
+ low_level {
+ feature_key: "res2"
+ channels_project: 32
+ }
+ instance {
+ low_level_override {
+ feature_key: "res3"
+ channels_project: 32
+ }
+ low_level_override {
+ feature_key: "res2"
+ channels_project: 16
+ }
+ instance_decoder_override {
+ feature_key: "res5"
+ decoder_channels: 128
+ atrous_rates: 6
+ atrous_rates: 12
+ atrous_rates: 18
+ }
+ center_head {
+ output_channels: 1
+ head_channels: 32
+ }
+ regression_head {
+ output_channels: 2
+ head_channels: 32
+ }
+ }
+ semantic_head {
+ output_channels: 19
+ head_channels: 256
+ }
+ }
+}
+trainer_options {
+ save_checkpoints_steps: 1000
+ save_summaries_steps: 100
+ steps_per_loop: 100
+ loss_options {
+ semantic_loss {
+ name: "softmax_cross_entropy"
+ weight: 1.0
+ top_k_percent: 0.2
+ }
+ center_loss {
+ name: "mse"
+ weight: 200
+ }
+ regression_loss {
+ name: "l1"
+ weight: 0.01
+ }
+ }
+ solver_options {
+ base_learning_rate: 0.0001
+ training_number_of_steps: 60000
+ }
+}
+train_dataset_options {
+ dataset: "cityscapes_panoptic"
+ # Update the path to training set.
+ file_pattern: "${TRAIN_SET}"
+ # Adjust the batch_size accordingly to better fit your GPU/TPU memory.
+ # Also see Q1 in g3doc/faq.md.
+ batch_size: 32
+ crop_size: 1025
+ crop_size: 2049
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ augmentations {
+ min_scale_factor: 0.5
+ max_scale_factor: 2.0
+ scale_factor_step_size: 0.1
+ autoaugment_policy_name: "simple_classification_policy_magnitude_scale_0.2"
+ }
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+}
+eval_dataset_options {
+ dataset: "cityscapes_panoptic"
+ # Update the path to validation set.
+ file_pattern: "${VAL_SET}"
+ batch_size: 1
+ crop_size: 1025
+ crop_size: 2049
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ # Add options to make the evaluation loss comparable to the training loss.
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+}
+evaluator_options {
+ continuous_eval_timeout: 43200
+ stuff_area_limit: 2048
+ center_score_threshold: 0.1
+ nms_kernel: 13
+ save_predictions: true
+ save_raw_predictions: false
+ # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and
+ # instance maps. For faster speed, compile TensorFlow with provided kernel
+ # implementation under the folder `tensorflow_ops`, and set
+ # merge_semantic_and_instance_with_tf_op to true.
+ merge_semantic_and_instance_with_tf_op: false
+}
+
diff --git a/configs/cityscapes/panoptic_deeplab/swidernet_sac_1_1_4.5_os16.textproto b/configs/cityscapes/panoptic_deeplab/swidernet_sac_1_1_4.5_os16.textproto
new file mode 100644
index 0000000000000000000000000000000000000000..fcda36d90977edd7164ccc0989c62ed796955d56
--- /dev/null
+++ b/configs/cityscapes/panoptic_deeplab/swidernet_sac_1_1_4.5_os16.textproto
@@ -0,0 +1,166 @@
+# proto-file: deeplab2/config.proto
+# proto-message: ExperimentOptions
+#
+# Panoptic-DeepLab with SWideRNet-SAC-(1, 1, 4.5) and output stride 16.
+#
+############### PLEASE READ THIS BEFORE USING THIS CONFIG ###############
+# Before using this config, you need to update the following fields:
+# - experiment_name: Use a unique experiment name for each experiment.
+# - initial_checkpoint: Update the path to the initial checkpoint.
+# - train_dataset_options.file_pattern: Update the path to the
+# training set. e.g., your_dataset/train*.tfrecord
+# - eval_dataset_options.file_pattern: Update the path to the
+# validation set, e.g., your_dataset/eval*.tfrecord
+# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you
+# could successfully compile the provided efficient merging operation
+# under the folder `tensorflow_ops`.
+#########################################################################
+#
+# SWideRNet-SAC-(1, 1, 4.5) employs the Switchable Atrous Convolution (SAC)
+# in the last stage of the network backbone.
+#
+# References:
+# For SAC, see
+# - Siyuan Qiao, et al. "DetectoRS: Detecting Objects with Recursive
+# Feature Pyramid and Switchable Atrous Convolution." In CVPR, 2021.
+# For SWideRNet, see
+# - Liang-Chieh Chen, et al. "Scaling Wide Residual Networks for
+# Panoptic Segmentation." arXiv: 2011.11675.
+# For Panoptic-DeepLab, see
+# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast
+# Baseline for Bottom-Up Panoptic Segmentation." In CVPR, 2020.
+
+# Use a unique experiment_name for each experiment.
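+# A minimal launch sketch, assuming the repository's standard trainer entry
+# point and flags (see the g3doc documentation for the authoritative command;
+# ${MODEL_DIR} is a placeholder for your experiment directory):
+#
+#   python trainer/train.py \
+#     --config_file=configs/cityscapes/panoptic_deeplab/swidernet_sac_1_1_4.5_os16.textproto \
+#     --mode=train --model_dir=${MODEL_DIR} --num_gpus=1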
+experiment_name: "${EXPERIMENT_NAME}" +model_options { + initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "swidernet" + output_stride: 16 + stem_width_multiplier: 1 + backbone_width_multiplier: 1 + backbone_layer_multiplier: 4.5 + use_sac_beyond_stride: 32 + drop_path_keep_prob: 0.8 + drop_path_schedule: "linear" + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 19 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.00025 + training_number_of_steps: 60000 + } +} +train_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 32 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 2.0 + scale_factor_step_size: 0.1 + autoaugment_policy_name: "simple_classification_policy_magnitude_scale_0.2" + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "cityscapes_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 43200 + stuff_area_limit: 2048 + center_score_threshold: 0.1 + nms_kernel: 13 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} diff --git a/configs/cityscapes/panoptic_deeplab/wide_resnet41_os16.textproto b/configs/cityscapes/panoptic_deeplab/wide_resnet41_os16.textproto new file mode 100644 index 0000000000000000000000000000000000000000..f04b18b78c2c1aa56182cc0e5d2950389be2d15b --- /dev/null +++ b/configs/cityscapes/panoptic_deeplab/wide_resnet41_os16.textproto @@ -0,0 +1,162 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with Wide ResNet-41 and output stride 16. 
+#
+############### PLEASE READ THIS BEFORE USING THIS CONFIG ###############
+# Before using this config, you need to update the following fields:
+# - experiment_name: Use a unique experiment name for each experiment.
+# - initial_checkpoint: Update the path to the initial checkpoint.
+# - train_dataset_options.file_pattern: Update the path to the
+# training set. e.g., your_dataset/train*.tfrecord
+# - eval_dataset_options.file_pattern: Update the path to the
+# validation set, e.g., your_dataset/eval*.tfrecord
+# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you
+# could successfully compile the provided efficient merging operation
+# under the folder `tensorflow_ops`.
+#########################################################################
+#
+# Wide ResNet-41 improves over Wide ResNet-38 by (1) removing the last residual
+# block, and (2) repeating the second-to-last residual block two more times.
+#
+# References:
+# For Wide ResNet-38, see
+# - Zifeng Wu, et al. "Wider or deeper: Revisiting the ResNet model for
+# visual recognition." Pattern Recognition, 2019.
+# For Wide ResNet-41, see
+# - Liang-Chieh Chen, et al. "Naive-Student: Leveraging Semi-Supervised
+# Learning in Video Sequences for Urban Scene Segmentation." In ECCV, 2020.
+# For Panoptic-DeepLab, see
+# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast
+# Baseline for Bottom-Up Panoptic Segmentation." In CVPR, 2020.
+
+# Use a unique experiment_name for each experiment.
+experiment_name: "${EXPERIMENT_NAME}"
+model_options {
+ initial_checkpoint: "${INIT_CHECKPOINT}"
+ backbone {
+ name: "wide_resnet41"
+ output_stride: 16
+ drop_path_keep_prob: 0.8
+ drop_path_schedule: "linear"
+ }
+ decoder {
+ feature_key: "res5"
+ decoder_channels: 256
+ aspp_channels: 256
+ atrous_rates: 6
+ atrous_rates: 12
+ atrous_rates: 18
+ }
+ panoptic_deeplab {
+ low_level {
+ feature_key: "res3"
+ channels_project: 64
+ }
+ low_level {
+ feature_key: "res2"
+ channels_project: 32
+ }
+ instance {
+ low_level_override {
+ feature_key: "res3"
+ channels_project: 32
+ }
+ low_level_override {
+ feature_key: "res2"
+ channels_project: 16
+ }
+ instance_decoder_override {
+ feature_key: "res5"
+ decoder_channels: 128
+ atrous_rates: 6
+ atrous_rates: 12
+ atrous_rates: 18
+ }
+ center_head {
+ output_channels: 1
+ head_channels: 32
+ }
+ regression_head {
+ output_channels: 2
+ head_channels: 32
+ }
+ }
+ semantic_head {
+ output_channels: 19
+ head_channels: 256
+ }
+ }
+}
+trainer_options {
+ save_checkpoints_steps: 1000
+ save_summaries_steps: 100
+ steps_per_loop: 100
+ loss_options {
+ semantic_loss {
+ name: "softmax_cross_entropy"
+ weight: 1.0
+ top_k_percent: 0.2
+ }
+ center_loss {
+ name: "mse"
+ weight: 200
+ }
+ regression_loss {
+ name: "l1"
+ weight: 0.01
+ }
+ }
+ solver_options {
+ base_learning_rate: 0.0001
+ training_number_of_steps: 60000
+ }
+}
+train_dataset_options {
+ dataset: "cityscapes_panoptic"
+ # Update the path to training set.
+ file_pattern: "${TRAIN_SET}"
+ # Adjust the batch_size accordingly to better fit your GPU/TPU memory.
+ # Also see Q1 in g3doc/faq.md.
+ batch_size: 32
+ crop_size: 1025
+ crop_size: 2049
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ augmentations {
+ min_scale_factor: 0.5
+ max_scale_factor: 2.0
+ scale_factor_step_size: 0.1
+ autoaugment_policy_name: "simple_classification_policy_magnitude_scale_0.2"
+ }
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+}
+eval_dataset_options {
+ dataset: "cityscapes_panoptic"
+ # Update the path to validation set.
+ file_pattern: "${VAL_SET}"
+ batch_size: 1
+ crop_size: 1025
+ crop_size: 2049
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ # Add options to make the evaluation loss comparable to the training loss.
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+}
+evaluator_options {
+ continuous_eval_timeout: 43200
+ stuff_area_limit: 2048
+ center_score_threshold: 0.1
+ nms_kernel: 13
+ save_predictions: true
+ save_raw_predictions: false
+ # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and
+ # instance maps. For faster speed, compile TensorFlow with provided kernel
+ # implementation under the folder `tensorflow_ops`, and set
+ # merge_semantic_and_instance_with_tf_op to true.
+ merge_semantic_and_instance_with_tf_op: false
+}
diff --git a/configs/cityscapes_dvps/vip_deeplab/resnet50_beta_os32.textproto b/configs/cityscapes_dvps/vip_deeplab/resnet50_beta_os32.textproto
new file mode 100644
index 0000000000000000000000000000000000000000..825671ab92b9d1fcb4abdc132d1f9f8f1e1cba05
--- /dev/null
+++ b/configs/cityscapes_dvps/vip_deeplab/resnet50_beta_os32.textproto
@@ -0,0 +1,168 @@
+# proto-file: deeplab2/config.proto
+# proto-message: ExperimentOptions
+#
+# ViP-DeepLab with ResNet-50-beta model variant and output stride 32.
+#
+############### PLEASE READ THIS BEFORE USING THIS CONFIG ###############
+# Before using this config, you need to update the following fields:
+# - experiment_name: Use a unique experiment name for each experiment.
+# - initial_checkpoint: Update the path to the initial checkpoint.
+# - train_dataset_options.file_pattern: Update the path to the
+# training set. e.g., your_dataset/train*.tfrecord
+# - eval_dataset_options.file_pattern: Update the path to the
+# validation set, e.g., your_dataset/eval*.tfrecord
+# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you
+# could successfully compile the provided efficient merging operation
+# under the folder `tensorflow_ops`.
+#########################################################################
+#
+# The `resnet50_beta` model variant replaces the first 7x7 convolutions in the
+# original `resnet50` with three 3x3 convolutions, which is useful for dense
+# prediction tasks.
+#
+# References:
+# For resnet-50-beta, see
+# https://github.com/tensorflow/models/blob/master/research/deeplab/core/resnet_v1_beta.py
+# For ViP-DeepLab, see
+# - Siyuan Qiao, et al. "ViP-DeepLab: Learning Visual Perception with
+# Depth-aware Video Panoptic Segmentation." In CVPR, 2021.
+
+# Use a unique experiment_name for each experiment.
+experiment_name: "${EXPERIMENT_NAME}"
+model_options {
+ # Update the path to the initial checkpoint (e.g., ImageNet
+ # pretrained checkpoint).
+ initial_checkpoint: "${INIT_CHECKPOINT}"
+ backbone {
+ name: "resnet50_beta"
+ output_stride: 32
+ }
+ decoder {
+ feature_key: "res5"
+ decoder_channels: 256
+ aspp_channels: 256
+ atrous_rates: 3
+ atrous_rates: 6
+ atrous_rates: 9
+ }
+ vip_deeplab {
+ low_level {
+ feature_key: "res3"
+ channels_project: 64
+ }
+ low_level {
+ feature_key: "res2"
+ channels_project: 32
+ }
+ instance {
+ low_level_override {
+ feature_key: "res3"
+ channels_project: 32
+ }
+ low_level_override {
+ feature_key: "res2"
+ channels_project: 16
+ }
+ instance_decoder_override {
+ feature_key: "res5"
+ decoder_channels: 128
+ atrous_rates: 3
+ atrous_rates: 6
+ atrous_rates: 9
+ }
+ center_head {
+ output_channels: 1
+ head_channels: 32
+ }
+ regression_head {
+ output_channels: 2
+ head_channels: 32
+ }
+ next_regression_head {
+ output_channels: 2
+ head_channels: 32
+ }
+ }
+ semantic_head {
+ output_channels: 19
+ head_channels: 256
+ }
+ }
+}
+trainer_options {
+ save_checkpoints_steps: 1000
+ save_summaries_steps: 100
+ steps_per_loop: 100
+ loss_options {
+ semantic_loss {
+ name: "softmax_cross_entropy"
+ weight: 1.0
+ top_k_percent: 0.2
+ }
+ center_loss {
+ name: "mse"
+ weight: 200
+ }
+ regression_loss {
+ name: "l1"
+ weight: 0.01
+ }
+ next_regression_loss {
+ name: "l1"
+ weight: 0.01
+ }
+ }
+ solver_options {
+ base_learning_rate: 0.00003125
+ training_number_of_steps: 60000
+ }
+}
+train_dataset_options {
+ dataset: "cityscapes_dvps"
+ # Update the path to training set.
+ file_pattern: "${TRAIN_SET}"
+ # Adjust the batch_size accordingly to better fit your GPU/TPU memory.
+ # Also see Q1 in g3doc/faq.md.
+ batch_size: 4
+ crop_size: 513
+ crop_size: 1025
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ augmentations {
+ min_scale_factor: 0.5
+ max_scale_factor: 2.0
+ scale_factor_step_size: 0.1
+ }
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+ use_next_frame: true
+}
+eval_dataset_options {
+ dataset: "cityscapes_dvps"
+ # Update the path to validation set.
+ file_pattern: "${VAL_SET}"
+ batch_size: 1
+ crop_size: 1025
+ crop_size: 2049
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ # Add options to make the evaluation loss comparable to the training loss.
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+ use_next_frame: true
+}
+evaluator_options {
+ continuous_eval_timeout: 43200
+ stuff_area_limit: 2048
+ center_score_threshold: 0.1
+ nms_kernel: 13
+ save_predictions: true
+ save_raw_predictions: false
+ # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and
+ # instance maps. For faster speed, compile TensorFlow with provided kernel
+ # implementation under the folder `tensorflow_ops`, and set
+ # merge_semantic_and_instance_with_tf_op to true.
+ merge_semantic_and_instance_with_tf_op: false
+}
diff --git a/configs/coco/max_deeplab/max_deeplab_s_os16_res1025_100k.textproto b/configs/coco/max_deeplab/max_deeplab_s_os16_res1025_100k.textproto
new file mode 100644
index 0000000000000000000000000000000000000000..aa9059adb8101283312cb39535837258e810c411
--- /dev/null
+++ b/configs/coco/max_deeplab/max_deeplab_s_os16_res1025_100k.textproto
@@ -0,0 +1,137 @@
+# proto-file: deeplab2/config.proto
+# proto-message: ExperimentOptions
+#
+# MaX-DeepLab-S with resolution 1025x1025 and 100k training steps.
+# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +######################################################################### +# +# MaX-DeepLab-S replaces the last two stages of ResNet-50-beta with axial- +# attention blocks and applies a small dual-path transformer. +# +# For axial-attention, see +# - Huiyu Wang, et al. "Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic +# Segmentation." In ECCV, 2020. +# For MaX-DeepLab, see +# - Huiyu Wang, et al. "MaX-DeepLab: End-to-End Panoptic Segmentation with Mask +# Transformers." In CVPR, 2021. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). + initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "max_deeplab_s" + output_stride: 16 + drop_path_keep_prob: 0.8 + drop_path_schedule: "linear" + } + decoder { + feature_key: "feature_semantic" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + max_deeplab { + pixel_space_head { + output_channels: 128 + head_channels: 256 + } + auxiliary_low_level { + feature_key: "res3" + channels_project: 64 + } + auxiliary_low_level { + feature_key: "res2" + channels_project: 32 + } + auxiliary_semantic_head { + output_channels: 134 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + } + pq_style_loss { + weight: 3.0 + } + mask_id_cross_entropy_loss { + weight: 0.3 + } + instance_discrimination_loss { + weight: 1.0 + } + } + solver_options { + base_learning_rate: 0.001 + training_number_of_steps: 100000 + warmup_steps: 5000 + backbone_learning_rate_multiplier: 0.1 + } +} +train_dataset_options { + dataset: "coco_panoptic" + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 64 + crop_size: 1025 + crop_size: 1025 + min_resize_value: 1025 + max_resize_value: 1025 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 1.5 + scale_factor_step_size: 0.1 + } + increase_small_instance_weights: false + small_instance_weight: 1.0 + # This option generates ground truth labels for MaX-Deeplab. + thing_id_mask_annotations: true +} +eval_dataset_options { + dataset: "coco_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 1025 + crop_size: 1025 + min_resize_value: 1025 + max_resize_value: 1025 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: false + small_instance_weight: 1.0 + # This option generates ground truth labels for MaX-Deeplab. 
+ thing_id_mask_annotations: true +} +evaluator_options { + continuous_eval_timeout: 43200 + thing_area_limit: 256 + stuff_area_limit: 4096 + transformer_class_confidence_threshold: 0.7 + pixel_confidence_threshold: 0.4 + save_predictions: true + save_raw_predictions: false + # Some options are inapplicable to MaX-DeepLab, including nms_kernel, + # merge_semantic_and_instance_with_tf_op, center_score_threshold, + # keep_k_centers, add_flipped_images, and eval_scales. +} diff --git a/configs/coco/max_deeplab/max_deeplab_s_os16_res1025_200k.textproto b/configs/coco/max_deeplab/max_deeplab_s_os16_res1025_200k.textproto new file mode 100644 index 0000000000000000000000000000000000000000..a15a5b6dbd139895277a6c515e87f518853415b1 --- /dev/null +++ b/configs/coco/max_deeplab/max_deeplab_s_os16_res1025_200k.textproto @@ -0,0 +1,137 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# MaX-DeepLab-S with resolution 1025x1025 and 200k training steps. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +######################################################################### +# +# MaX-DeepLab-S replaces the last two stages of ResNet-50-beta with axial- +# attention blocks and applies a small dual-path transformer. +# +# For axial-attention, see +# - Huiyu Wang, et al. "Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic +# Segmentation." In ECCV, 2020. +# For MaX-DeepLab, see +# - Huiyu Wang, et al. "MaX-DeepLab: End-to-End Panoptic Segmentation with Mask +# Transformers." In CVPR, 2021. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). + initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "max_deeplab_s" + output_stride: 16 + drop_path_keep_prob: 0.8 + drop_path_schedule: "linear" + } + decoder { + feature_key: "feature_semantic" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + max_deeplab { + pixel_space_head { + output_channels: 128 + head_channels: 256 + } + auxiliary_low_level { + feature_key: "res3" + channels_project: 64 + } + auxiliary_low_level { + feature_key: "res2" + channels_project: 32 + } + auxiliary_semantic_head { + output_channels: 134 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + } + pq_style_loss { + weight: 3.0 + } + mask_id_cross_entropy_loss { + weight: 0.3 + } + instance_discrimination_loss { + weight: 1.0 + } + } + solver_options { + base_learning_rate: 0.001 + training_number_of_steps: 200000 + warmup_steps: 5000 + backbone_learning_rate_multiplier: 0.1 + } +} +train_dataset_options { + dataset: "coco_panoptic" + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. 
+ batch_size: 64 + crop_size: 1025 + crop_size: 1025 + min_resize_value: 1025 + max_resize_value: 1025 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 1.5 + scale_factor_step_size: 0.1 + } + increase_small_instance_weights: false + small_instance_weight: 1.0 + # This option generates ground truth labels for MaX-Deeplab. + thing_id_mask_annotations: true +} +eval_dataset_options { + dataset: "coco_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 1025 + crop_size: 1025 + min_resize_value: 1025 + max_resize_value: 1025 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: false + small_instance_weight: 1.0 + # This option generates ground truth labels for MaX-Deeplab. + thing_id_mask_annotations: true +} +evaluator_options { + continuous_eval_timeout: 43200 + thing_area_limit: 256 + stuff_area_limit: 4096 + transformer_class_confidence_threshold: 0.7 + pixel_confidence_threshold: 0.4 + save_predictions: true + save_raw_predictions: false + # Some options are inapplicable to MaX-DeepLab, including nms_kernel, + # merge_semantic_and_instance_with_tf_op, center_score_threshold, + # keep_k_centers, add_flipped_images, and eval_scales. +} diff --git a/configs/coco/max_deeplab/max_deeplab_s_os16_res641_100k.textproto b/configs/coco/max_deeplab/max_deeplab_s_os16_res641_100k.textproto new file mode 100644 index 0000000000000000000000000000000000000000..c6c385757b16a2e0801f03bf56dcdd2ad78b187d --- /dev/null +++ b/configs/coco/max_deeplab/max_deeplab_s_os16_res641_100k.textproto @@ -0,0 +1,137 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# MaX-DeepLab-S with resolution 641x641 and 100k training steps. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +######################################################################### +# +# MaX-DeepLab-S replaces the last two stages of ResNet-50-beta with axial- +# attention blocks and applies a small dual-path transformer. +# +# For axial-attention, see +# - Huiyu Wang, et al. "Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic +# Segmentation." In ECCV, 2020. +# For MaX-DeepLab, see +# - Huiyu Wang, et al. "MaX-DeepLab: End-to-End Panoptic Segmentation with Mask +# Transformers." In CVPR, 2021. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). 
+ initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "max_deeplab_s" + output_stride: 16 + drop_path_keep_prob: 0.8 + drop_path_schedule: "linear" + } + decoder { + feature_key: "feature_semantic" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + max_deeplab { + pixel_space_head { + output_channels: 128 + head_channels: 256 + } + auxiliary_low_level { + feature_key: "res3" + channels_project: 64 + } + auxiliary_low_level { + feature_key: "res2" + channels_project: 32 + } + auxiliary_semantic_head { + output_channels: 134 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + } + pq_style_loss { + weight: 3.0 + } + mask_id_cross_entropy_loss { + weight: 0.3 + } + instance_discrimination_loss { + weight: 1.0 + } + } + solver_options { + base_learning_rate: 0.001 + training_number_of_steps: 100000 + warmup_steps: 5000 + backbone_learning_rate_multiplier: 0.1 + } +} +train_dataset_options { + dataset: "coco_panoptic" + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 64 + crop_size: 641 + crop_size: 641 + min_resize_value: 641 + max_resize_value: 641 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 1.5 + scale_factor_step_size: 0.1 + } + increase_small_instance_weights: false + small_instance_weight: 1.0 + # This option generates ground truth labels for MaX-Deeplab. + thing_id_mask_annotations: true +} +eval_dataset_options { + dataset: "coco_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 641 + crop_size: 641 + min_resize_value: 641 + max_resize_value: 641 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: false + small_instance_weight: 1.0 + # This option generates ground truth labels for MaX-Deeplab. + thing_id_mask_annotations: true +} +evaluator_options { + continuous_eval_timeout: 43200 + thing_area_limit: 100 + stuff_area_limit: 1600 + transformer_class_confidence_threshold: 0.7 + pixel_confidence_threshold: 0.4 + save_predictions: true + save_raw_predictions: false + # Some options are inapplicable to MaX-DeepLab, including nms_kernel, + # merge_semantic_and_instance_with_tf_op, center_score_threshold, + # keep_k_centers, add_flipped_images, and eval_scales. +} diff --git a/configs/coco/max_deeplab/max_deeplab_s_os16_res641_200k.textproto b/configs/coco/max_deeplab/max_deeplab_s_os16_res641_200k.textproto new file mode 100644 index 0000000000000000000000000000000000000000..3261da40abb2be7c980760c70be183dc63e7255b --- /dev/null +++ b/configs/coco/max_deeplab/max_deeplab_s_os16_res641_200k.textproto @@ -0,0 +1,137 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# MaX-DeepLab-S with resolution 641x641 and 200k training steps. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. 
e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +######################################################################### +# +# MaX-DeepLab-S replaces the last two stages of ResNet-50-beta with axial- +# attention blocks and applies a small dual-path transformer. +# +# For axial-attention, see +# - Huiyu Wang, et al. "Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic +# Segmentation." In ECCV, 2020. +# For MaX-DeepLab, see +# - Huiyu Wang, et al. "MaX-DeepLab: End-to-End Panoptic Segmentation with Mask +# Transformers." In CVPR, 2021. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). + initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "max_deeplab_s" + output_stride: 16 + drop_path_keep_prob: 0.8 + drop_path_schedule: "linear" + } + decoder { + feature_key: "feature_semantic" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + max_deeplab { + pixel_space_head { + output_channels: 128 + head_channels: 256 + } + auxiliary_low_level { + feature_key: "res3" + channels_project: 64 + } + auxiliary_low_level { + feature_key: "res2" + channels_project: 32 + } + auxiliary_semantic_head { + output_channels: 134 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + } + pq_style_loss { + weight: 3.0 + } + mask_id_cross_entropy_loss { + weight: 0.3 + } + instance_discrimination_loss { + weight: 1.0 + } + } + solver_options { + base_learning_rate: 0.001 + training_number_of_steps: 200000 + warmup_steps: 5000 + backbone_learning_rate_multiplier: 0.1 + } +} +train_dataset_options { + dataset: "coco_panoptic" + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 64 + crop_size: 641 + crop_size: 641 + min_resize_value: 641 + max_resize_value: 641 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 1.5 + scale_factor_step_size: 0.1 + } + increase_small_instance_weights: false + small_instance_weight: 1.0 + # This option generates ground truth labels for MaX-Deeplab. + thing_id_mask_annotations: true +} +eval_dataset_options { + dataset: "coco_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 641 + crop_size: 641 + min_resize_value: 641 + max_resize_value: 641 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: false + small_instance_weight: 1.0 + # This option generates ground truth labels for MaX-Deeplab. + thing_id_mask_annotations: true +} +evaluator_options { + continuous_eval_timeout: 43200 + thing_area_limit: 100 + stuff_area_limit: 1600 + transformer_class_confidence_threshold: 0.7 + pixel_confidence_threshold: 0.4 + save_predictions: true + save_raw_predictions: false + # Some options are inapplicable to MaX-DeepLab, including nms_kernel, + # merge_semantic_and_instance_with_tf_op, center_score_threshold, + # keep_k_centers, add_flipped_images, and eval_scales. 
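+ # For scale: these area limits track the evaluation resolution. The res1025
+ # configs above use thing_area_limit: 256 and stuff_area_limit: 4096, i.e.
+ # roughly (1025/641)^2 ~= 2.56x the 100 and 1600 used here at res641.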
+} diff --git a/configs/coco/max_deeplab/max_deeplab_s_os16_res641_400k.textproto b/configs/coco/max_deeplab/max_deeplab_s_os16_res641_400k.textproto new file mode 100644 index 0000000000000000000000000000000000000000..6e5cb1a9adb6587a38041f085073751910d52508 --- /dev/null +++ b/configs/coco/max_deeplab/max_deeplab_s_os16_res641_400k.textproto @@ -0,0 +1,137 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# MaX-DeepLab-S with resolution 641x641 and 400k training steps. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +######################################################################### +# +# MaX-DeepLab-S replaces the last two stages of ResNet-50-beta with axial- +# attention blocks and applies a small dual-path transformer. +# +# For axial-attention, see +# - Huiyu Wang, et al. "Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic +# Segmentation." In ECCV, 2020. +# For MaX-DeepLab, see +# - Huiyu Wang, et al. "MaX-DeepLab: End-to-End Panoptic Segmentation with Mask +# Transformers." In CVPR, 2021. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). + initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "max_deeplab_s" + output_stride: 16 + drop_path_keep_prob: 0.8 + drop_path_schedule: "linear" + } + decoder { + feature_key: "feature_semantic" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + max_deeplab { + pixel_space_head { + output_channels: 128 + head_channels: 256 + } + auxiliary_low_level { + feature_key: "res3" + channels_project: 64 + } + auxiliary_low_level { + feature_key: "res2" + channels_project: 32 + } + auxiliary_semantic_head { + output_channels: 134 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + } + pq_style_loss { + weight: 3.0 + } + mask_id_cross_entropy_loss { + weight: 0.3 + } + instance_discrimination_loss { + weight: 1.0 + } + } + solver_options { + base_learning_rate: 0.001 + training_number_of_steps: 400000 + warmup_steps: 5000 + backbone_learning_rate_multiplier: 0.1 + } +} +train_dataset_options { + dataset: "coco_panoptic" + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 64 + crop_size: 641 + crop_size: 641 + min_resize_value: 641 + max_resize_value: 641 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 1.5 + scale_factor_step_size: 0.1 + } + increase_small_instance_weights: false + small_instance_weight: 1.0 + # This option generates ground truth labels for MaX-Deeplab. + thing_id_mask_annotations: true +} +eval_dataset_options { + dataset: "coco_panoptic" + # Update the path to validation set. 
+ file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 641 + crop_size: 641 + min_resize_value: 641 + max_resize_value: 641 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: false + small_instance_weight: 1.0 + # This option generates ground truth labels for MaX-Deeplab. + thing_id_mask_annotations: true +} +evaluator_options { + continuous_eval_timeout: 43200 + thing_area_limit: 100 + stuff_area_limit: 1600 + transformer_class_confidence_threshold: 0.7 + pixel_confidence_threshold: 0.4 + save_predictions: true + save_raw_predictions: false + # Some options are inapplicable to MaX-DeepLab, including nms_kernel, + # merge_semantic_and_instance_with_tf_op, center_score_threshold, + # keep_k_centers, add_flipped_images, and eval_scales. +} diff --git a/configs/coco/panoptic_deeplab/resnet50_beta_os16.textproto b/configs/coco/panoptic_deeplab/resnet50_beta_os16.textproto new file mode 100644 index 0000000000000000000000000000000000000000..08c575651e3975b7f84a5b18d49b9ff3e6f11711 --- /dev/null +++ b/configs/coco/panoptic_deeplab/resnet50_beta_os16.textproto @@ -0,0 +1,159 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with ResNet-50-beta model variant and output stride 16. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# The `resnet50_beta` model variant replaces the first 7x7 convolutions in the +# original `resnet50` with three 3x3 convolutions, which is useful for dense +# prediction tasks. +# +# References: +# For resnet-50-beta, see +# https://github.com/tensorflow/models/blob/master/research/deeplab/core/resnet_v1_beta.py +# For Panoptic-DeepLab, see +# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline +# for Bottom-Up Panoptic Segmentation." In CVPR, 2020. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). 
+ initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "resnet50_beta" + output_stride: 16 + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 134 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.0005 + training_number_of_steps: 200000 + warmup_steps: 2000 + } +} +train_dataset_options { + dataset: "coco_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 64 + crop_size: 641 + crop_size: 641 + min_resize_value: 641 + max_resize_value: 641 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 1.5 + scale_factor_step_size: 0.1 + autoaugment_policy_name: "simple_classification_policy_magnitude_scale_0.2" + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "coco_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 641 + crop_size: 641 + min_resize_value: 641 + max_resize_value: 641 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 43200 + stuff_area_limit: 4096 + center_score_threshold: 0.1 + nms_kernel: 41 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} + diff --git a/configs/coco/panoptic_deeplab/resnet50_beta_os32.textproto b/configs/coco/panoptic_deeplab/resnet50_beta_os32.textproto new file mode 100644 index 0000000000000000000000000000000000000000..f4ad475800f8ef8ddc42123bd50cc8689f244dae --- /dev/null +++ b/configs/coco/panoptic_deeplab/resnet50_beta_os32.textproto @@ -0,0 +1,158 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with ResNet-50-beta model variant and output stride 32. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. 
+# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# The `resnet50_beta` model variant replaces the first 7x7 convolutions in the +# original `resnet50` with three 3x3 convolutions, which is useful for dense +# prediction tasks. +# +# References: +# For resnet-50-beta, see +# https://github.com/tensorflow/models/blob/master/research/deeplab/core/resnet_v1_beta.py +# For Panoptic-DeepLab, see +# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline +# for Bottom-Up Panoptic Segmentation." In CVPR, 2020. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). + initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "resnet50_beta" + output_stride: 32 + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 3 + atrous_rates: 6 + atrous_rates: 9 + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + atrous_rates: 3 + atrous_rates: 6 + atrous_rates: 9 + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 134 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.0005 + training_number_of_steps: 200000 + warmup_steps: 2000 + } +} +train_dataset_options { + dataset: "coco_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 64 + crop_size: 641 + crop_size: 641 + min_resize_value: 641 + max_resize_value: 641 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 1.5 + scale_factor_step_size: 0.1 + autoaugment_policy_name: "simple_classification_policy_magnitude_scale_0.2" + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "coco_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 641 + crop_size: 641 + min_resize_value: 641 + max_resize_value: 641 + # Add options to make the evaluation loss comparable to the training loss. 
+ increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 43200 + stuff_area_limit: 4096 + center_score_threshold: 0.1 + nms_kernel: 41 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} diff --git a/configs/coco/panoptic_deeplab/resnet50_os16.textproto b/configs/coco/panoptic_deeplab/resnet50_os16.textproto new file mode 100644 index 0000000000000000000000000000000000000000..c8749fbcd795a4346cfc4c893682535ff4bd1454 --- /dev/null +++ b/configs/coco/panoptic_deeplab/resnet50_os16.textproto @@ -0,0 +1,155 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with ResNet-50 and output stride 16. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# References: +# For ResNet, see +# - Kaiming He, et al. "Deep Residual Learning for Image Recognition." +# In CVPR, 2016. +# For Panoptic-DeepLab, see +# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline +# for Bottom-Up Panoptic Segmentation." In CVPR, 2020. + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). 
+ initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "resnet50" + output_stride: 16 + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 134 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.0005 + training_number_of_steps: 200000 + warmup_steps: 2000 + } +} +train_dataset_options { + dataset: "coco_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 64 + crop_size: 641 + crop_size: 641 + min_resize_value: 641 + max_resize_value: 641 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 1.5 + scale_factor_step_size: 0.1 + autoaugment_policy_name: "simple_classification_policy_magnitude_scale_0.2" + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "coco_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 641 + crop_size: 641 + min_resize_value: 641 + max_resize_value: 641 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 43200 + stuff_area_limit: 4096 + center_score_threshold: 0.1 + nms_kernel: 41 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} diff --git a/configs/coco/panoptic_deeplab/resnet50_os32.textproto b/configs/coco/panoptic_deeplab/resnet50_os32.textproto new file mode 100644 index 0000000000000000000000000000000000000000..5ebab1b352c1574c9ed5410a617905598f409c88 --- /dev/null +++ b/configs/coco/panoptic_deeplab/resnet50_os32.textproto @@ -0,0 +1,157 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with ResNet-50 and output stride 32. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. 
+# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# References: +# For ResNet, see +# - Kaiming He, et al. "Deep Residual Learning for Image Recognition." +# In CVPR, 2016. +# For Panoptic-DeepLab, see +# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline +# for Bottom-Up Panoptic Segmentation." In CVPR, 2020. + + +# Use a unique experiment_name for each experiment. +experiment_name: "${EXPERIMENT_NAME}" +model_options { + # Update the path to the initial checkpoint (e.g., ImageNet + # pretrained checkpoint). + initial_checkpoint: "${INIT_CHECKPOINT}" + backbone { + name: "resnet50" + output_stride: 32 + } + decoder { + feature_key: "res5" + decoder_channels: 256 + aspp_channels: 256 + atrous_rates: 3 + atrous_rates: 6 + atrous_rates: 9 + } + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + instance { + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + atrous_rates: 3 + atrous_rates: 6 + atrous_rates: 9 + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + semantic_head { + output_channels: 134 + head_channels: 256 + } + } +} +trainer_options { + save_checkpoints_steps: 1000 + save_summaries_steps: 100 + steps_per_loop: 100 + loss_options { + semantic_loss { + name: "softmax_cross_entropy" + weight: 1.0 + top_k_percent: 0.2 + } + center_loss { + name: "mse" + weight: 200 + } + regression_loss { + name: "l1" + weight: 0.01 + } + } + solver_options { + base_learning_rate: 0.0005 + training_number_of_steps: 200000 + warmup_steps: 2000 + } +} +train_dataset_options { + dataset: "coco_panoptic" + # Update the path to training set. + file_pattern: "${TRAIN_SET}" + # Adjust the batch_size accordingly to better fit your GPU/TPU memory. + # Also see Q1 in g3doc/faq.md. + batch_size: 64 + crop_size: 641 + crop_size: 641 + min_resize_value: 641 + max_resize_value: 641 + augmentations { + min_scale_factor: 0.5 + max_scale_factor: 1.5 + scale_factor_step_size: 0.1 + autoaugment_policy_name: "simple_classification_policy_magnitude_scale_0.2" + } + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +eval_dataset_options { + dataset: "coco_panoptic" + # Update the path to validation set. + file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 641 + crop_size: 641 + min_resize_value: 641 + max_resize_value: 641 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 43200 + stuff_area_limit: 4096 + center_score_threshold: 0.1 + nms_kernel: 41 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. 
For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} + diff --git a/configs/example/example_cityscapes_deeplabv3.textproto b/configs/example/example_cityscapes_deeplabv3.textproto new file mode 100644 index 0000000000000000000000000000000000000000..4e29b9a745240114bf1baf85fb513424e14c10fe --- /dev/null +++ b/configs/example/example_cityscapes_deeplabv3.textproto @@ -0,0 +1,25 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions + +model_options { + decoder { + feature_key: "res5" + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + + backbone { + name: "resnet50" + } + + # Example for cityscapes. + deeplab_v3 { + num_classes: 19 + } +} + +train_dataset_options { + crop_size: 1025 + crop_size: 2049 +} diff --git a/configs/example/example_cityscapes_deeplabv3_mv3l.textproto b/configs/example/example_cityscapes_deeplabv3_mv3l.textproto new file mode 100644 index 0000000000000000000000000000000000000000..f190564ddc683bdbe7c62ddf5df556075b2b5a15 --- /dev/null +++ b/configs/example/example_cityscapes_deeplabv3_mv3l.textproto @@ -0,0 +1,26 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions + +model_options { + decoder { + feature_key: "res5" + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + + backbone { + name: "mobilenet_v3_large" + use_squeeze_and_excite: true + } + + # Example for cityscapes. + deeplab_v3 { + num_classes: 19 + } +} + +train_dataset_options { + crop_size: 1025 + crop_size: 2049 +} diff --git a/configs/example/example_cityscapes_deeplabv3plus.textproto b/configs/example/example_cityscapes_deeplabv3plus.textproto new file mode 100644 index 0000000000000000000000000000000000000000..eb79993563237f5de4f58014a1bfd928eb7c6c83 --- /dev/null +++ b/configs/example/example_cityscapes_deeplabv3plus.textproto @@ -0,0 +1,29 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions + +model_options { + decoder { + feature_key: "res5" + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + + backbone { + name: "resnet50" + } + + deeplab_v3_plus { + low_level { + feature_key: "res2" + channels_project: 48 + } + # Example for cityscapes. + num_classes: 19 + } +} + +train_dataset_options { + crop_size: 1025 + crop_size: 2049 +} diff --git a/configs/example/example_cityscapes_panoptic_deeplab.textproto b/configs/example/example_cityscapes_panoptic_deeplab.textproto new file mode 100644 index 0000000000000000000000000000000000000000..a06b9b696e8c30487b184a9b7b1a3c05c634992a --- /dev/null +++ b/configs/example/example_cityscapes_panoptic_deeplab.textproto @@ -0,0 +1,61 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions + +model_options { + decoder { + feature_key: "res5" + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + + backbone { + name: "resnet50" + } + + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + semantic_head { + # Example for cityscapes. 
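+ # Cityscapes evaluates on 19 trainId semantic classes, hence 19 channels.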
+ output_channels: 19 + head_channels: 256 + } + instance { + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + } +} + +train_dataset_options { + crop_size: 1025 + crop_size: 2049 +} diff --git a/configs/example/example_cityscapes_panoptic_deeplab_mv3l.textproto b/configs/example/example_cityscapes_panoptic_deeplab_mv3l.textproto new file mode 100644 index 0000000000000000000000000000000000000000..7ea7cae2e44d4edb20c3c5c685e6193350458cbe --- /dev/null +++ b/configs/example/example_cityscapes_panoptic_deeplab_mv3l.textproto @@ -0,0 +1,62 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions + +model_options { + decoder { + feature_key: "res5" + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + + backbone { + name: "mobilenet_v3_large" + use_squeeze_and_excite: true + } + + panoptic_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + semantic_head { + # Example for cityscapes. + output_channels: 19 + head_channels: 256 + } + instance { + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + } +} + +train_dataset_options { + crop_size: 1025 + crop_size: 2049 +} diff --git a/configs/example/example_coco_max_deeplab.textproto b/configs/example/example_coco_max_deeplab.textproto new file mode 100644 index 0000000000000000000000000000000000000000..c53549d77dc60725f5bd1960ebd9e89931b316fc --- /dev/null +++ b/configs/example/example_coco_max_deeplab.textproto @@ -0,0 +1,41 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions + +model_options { + decoder { + feature_key: "feature_semantic" + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + + backbone { + name: "max_deeplab_s" + output_stride: 16 + } + + max_deeplab { + pixel_space_head { + output_channels: 128 + head_channels: 256 + } + auxiliary_low_level { + feature_key: "res3" + channels_project: 64 + } + auxiliary_low_level { + feature_key: "res2" + channels_project: 32 + } + auxiliary_semantic_head { + # Example for COCO. 
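+ # COCO panoptic defines 133 categories; the extra channel covers the
+ # void label, giving 134 output channels.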
+ output_channels: 134 + head_channels: 256 + } + } +} + +train_dataset_options { + crop_size: 65 + crop_size: 65 +} diff --git a/configs/example/example_kitti-step_motion_deeplab.textproto b/configs/example/example_kitti-step_motion_deeplab.textproto new file mode 100644 index 0000000000000000000000000000000000000000..383f8eaac3ba8d538e4578d6aca8f6129cead73f --- /dev/null +++ b/configs/example/example_kitti-step_motion_deeplab.textproto @@ -0,0 +1,60 @@ +# proto-file: deeplab2/model.proto +# proto-message: ModelOptions + +decoder { + feature_key: "res5" + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 +} + +backbone { + name: "resnet50" +} + +# Motion-Deeplab adopts Panoptic-Deeplab for the task of Video Panoptic +# Segmentation or Segmenting and Tracking Every Pixel (STEP). +motion_deeplab { + low_level { + feature_key: "res3" + channels_project: 64 + } + low_level { + feature_key: "res2" + channels_project: 32 + } + semantic_head { + # Example for KITTI-STEP. + output_channels: 19 + head_channels: 256 + } + instance { + instance_decoder_override { + feature_key: "res5" + decoder_channels: 128 + atrous_rates: 6 + atrous_rates: 12 + atrous_rates: 18 + } + low_level_override { + feature_key: "res3" + channels_project: 32 + } + low_level_override { + feature_key: "res2" + channels_project: 16 + } + center_head { + output_channels: 1 + head_channels: 32 + } + regression_head { + output_channels: 2 + head_channels: 32 + } + } + motion_head { + output_channels: 2 + head_channels: 32 + } +} diff --git a/configs/kitti/motion_deeplab/resnet50_os32.textproto b/configs/kitti/motion_deeplab/resnet50_os32.textproto new file mode 100644 index 0000000000000000000000000000000000000000..534e6c8b1cc07b200d7d0752f767c17a89fb5ada --- /dev/null +++ b/configs/kitti/motion_deeplab/resnet50_os32.textproto @@ -0,0 +1,168 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Motion-DeepLab with ResNet-50 and output stride 32. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# This config uses the Cityscapes pretrained checkpoint where crowd label is +# kept to pretrain the semantic segmentation branch. Additionally, we perform +# net surgery on the first 3x3 convolution to take two-frame inputs. +# +# References: +# For ResNet, see +# - Kaiming He, et al. "Deep Residual Learning for Image Recognition." +# In CVPR, 2016. +# For Motion-DeepLab, see +# - Mark Weber, et al. "STEP: Segmenting and Tracking Every Pixel." +# arXiv: 2102.11859. + +# Use a unique experiment_name for each experiment. 
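The header above notes that Motion-DeepLab reuses an RGB-pretrained backbone by performing net surgery on its first convolution so it can consume two concatenated frames. A minimal sketch of one common way to do this, assuming a kernel laid out as [height, width, in_channels, out_channels]; the repo's own surgery utilities may differ:

```python
import numpy as np

def expand_first_conv_kernel(kernel: np.ndarray, num_frames: int = 2) -> np.ndarray:
  """Tiles an RGB-pretrained kernel over the input-channel axis.

  Rescaling by 1 / num_frames keeps the expected activation magnitude
  roughly unchanged when the input becomes a channel-wise concatenation
  of `num_frames` RGB frames.
  """
  tiled = np.concatenate([kernel] * num_frames, axis=2)
  return tiled / num_frames

# A 3x3 stem kernel over RGB becomes a kernel over two stacked RGB frames.
rgb_kernel = np.random.normal(size=(3, 3, 3, 64)).astype(np.float32)
two_frame_kernel = expand_first_conv_kernel(rgb_kernel, num_frames=2)
assert two_frame_kernel.shape == (3, 3, 6, 64)
```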
+experiment_name: "${EXPERIMENT_NAME}"
+model_options {
+ # Update the path to the initial checkpoint (e.g., ImageNet
+ # pretrained checkpoint).
+ initial_checkpoint: "${INIT_CHECKPOINT}"
+ backbone {
+ name: "resnet50"
+ output_stride: 32
+ }
+ decoder {
+ feature_key: "res5"
+ decoder_channels: 256
+ aspp_channels: 256
+ atrous_rates: 3
+ atrous_rates: 6
+ atrous_rates: 9
+ }
+ motion_deeplab {
+ low_level {
+ feature_key: "res3"
+ channels_project: 64
+ }
+ low_level {
+ feature_key: "res2"
+ channels_project: 32
+ }
+ instance {
+ low_level_override {
+ feature_key: "res3"
+ channels_project: 32
+ }
+ low_level_override {
+ feature_key: "res2"
+ channels_project: 16
+ }
+ instance_decoder_override {
+ feature_key: "res5"
+ decoder_channels: 128
+ atrous_rates: 3
+ atrous_rates: 6
+ atrous_rates: 9
+ }
+ center_head {
+ output_channels: 1
+ head_channels: 32
+ }
+ regression_head {
+ output_channels: 2
+ head_channels: 32
+ }
+ }
+ semantic_head {
+ output_channels: 19
+ head_channels: 256
+ }
+ motion_head {
+ output_channels: 2
+ head_channels: 32
+ }
+ }
+}
+trainer_options {
+ save_checkpoints_steps: 500
+ save_summaries_steps: 100
+ steps_per_loop: 100
+ loss_options {
+ semantic_loss {
+ name: "softmax_cross_entropy"
+ weight: 1.0
+ top_k_percent: 0.2
+ }
+ center_loss {
+ name: "mse"
+ weight: 200
+ }
+ regression_loss {
+ name: "l1"
+ weight: 0.01
+ }
+ motion_loss {
+ name: "l1"
+ weight: 0.01
+ }
+ }
+ solver_options {
+ base_learning_rate: 0.0001
+ training_number_of_steps: 50000
+ }
+}
+train_dataset_options {
+ dataset: "kitti_step"
+ # Update the path to training set.
+ file_pattern: "${TRAIN_SET}"
+ # Adjust the batch_size accordingly to better fit your GPU/TPU memory.
+ # Also see Q1 in g3doc/faq.md.
+ batch_size: 32
+ crop_size: 385
+ crop_size: 1249
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ augmentations {
+ min_scale_factor: 0.5
+ max_scale_factor: 2.0
+ scale_factor_step_size: 0.1
+ }
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+ use_two_frames: true
+}
+eval_dataset_options {
+ dataset: "kitti_step"
+ # Update the path to validation set.
+ file_pattern: "${VAL_SET}"
+ batch_size: 1
+ crop_size: 385
+ crop_size: 1249
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ # Add options to make the evaluation loss comparable to the training loss.
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+ use_two_frames: true
+}
+evaluator_options {
+ continuous_eval_timeout: 21600
+ stuff_area_limit: 0
+ center_score_threshold: 0.1
+ nms_kernel: 13
+ save_predictions: true
+ # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and
+ # instance maps. For faster speed, compile TensorFlow with provided kernel
+ # implementation under the folder `tensorflow_ops`, and set
+ # merge_semantic_and_instance_with_tf_op to true.
+ merge_semantic_and_instance_with_tf_op: false
+}
diff --git a/configs/kitti/motion_deeplab/resnet50_os32_trainval.textproto b/configs/kitti/motion_deeplab/resnet50_os32_trainval.textproto
new file mode 100644
index 0000000000000000000000000000000000000000..2fb6c6a998d2af9cac805f31c1e9c3c1b4a049e6
--- /dev/null
+++ b/configs/kitti/motion_deeplab/resnet50_os32_trainval.textproto
@@ -0,0 +1,169 @@
+# proto-file: deeplab2/config.proto
+# proto-message: ExperimentOptions
+#
+# Motion-DeepLab with ResNet-50 and output stride 32.
+#
+############### PLEASE READ THIS BEFORE USING THIS CONFIG ###############
+# Before using this config, you need to update the following fields:
+# - experiment_name: Use a unique experiment name for each experiment.
+# - initial_checkpoint: Update the path to the initial checkpoint.
+# - train_dataset_options.file_pattern: Update the path to the
+# training set. e.g., your_dataset/train*.tfrecord
+# - eval_dataset_options.file_pattern: Update the path to the
+# validation set, e.g., your_dataset/eval*.tfrecord
+# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you
+# could successfully compile the provided efficient merging operation
+# under the folder `tensorflow_ops`.
+#########################################################################
+#
+# This config uses the Cityscapes pretrained checkpoint where crowd label is
+# kept to pretrain the semantic segmentation branch. Additionally, we perform
+# net surgery on the first 3x3 convolution to take two-frame inputs.
+#
+# References:
+# For ResNet, see
+# - Kaiming He, et al. "Deep Residual Learning for Image Recognition."
+# In CVPR, 2016.
+# For Motion-DeepLab, see
+# - Mark Weber, et al. "STEP: Segmenting and Tracking Every Pixel."
+# arXiv: 2102.11859.
+
+# Use a unique experiment_name for each experiment.
+experiment_name: "${EXPERIMENT_NAME}"
+model_options {
+ # Update the path to the initial checkpoint (e.g., ImageNet
+ # pretrained checkpoint).
+ initial_checkpoint: "${INIT_CHECKPOINT}"
+ backbone {
+ name: "resnet50"
+ output_stride: 32
+ }
+ decoder {
+ feature_key: "res5"
+ decoder_channels: 256
+ aspp_channels: 256
+ atrous_rates: 3
+ atrous_rates: 6
+ atrous_rates: 9
+ }
+ motion_deeplab {
+ low_level {
+ feature_key: "res3"
+ channels_project: 64
+ }
+ low_level {
+ feature_key: "res2"
+ channels_project: 32
+ }
+ instance {
+ low_level_override {
+ feature_key: "res3"
+ channels_project: 32
+ }
+ low_level_override {
+ feature_key: "res2"
+ channels_project: 16
+ }
+ instance_decoder_override {
+ feature_key: "res5"
+ decoder_channels: 128
+ atrous_rates: 3
+ atrous_rates: 6
+ atrous_rates: 9
+ }
+ center_head {
+ output_channels: 1
+ head_channels: 32
+ }
+ regression_head {
+ output_channels: 2
+ head_channels: 32
+ }
+ }
+ semantic_head {
+ output_channels: 19
+ head_channels: 256
+ }
+ motion_head {
+ output_channels: 2
+ head_channels: 32
+ }
+ }
+}
+trainer_options {
+ save_checkpoints_steps: 500
+ save_summaries_steps: 100
+ steps_per_loop: 100
+ loss_options {
+ semantic_loss {
+ name: "softmax_cross_entropy"
+ weight: 1.0
+ top_k_percent: 0.2
+ }
+ center_loss {
+ name: "mse"
+ weight: 200
+ }
+ regression_loss {
+ name: "l1"
+ weight: 0.01
+ }
+ motion_loss {
+ name: "l1"
+ weight: 0.01
+ }
+ }
+ solver_options {
+ base_learning_rate: 0.00001
+ training_number_of_steps: 50000
+ }
+}
+train_dataset_options {
+ dataset: "kitti_step"
+ # Update the path to training set.
+ file_pattern: "${TRAIN_SET}"
+ file_pattern: "${VAL_SET}"
+ # Adjust the batch_size accordingly to better fit your GPU/TPU memory.
+ # Also see Q1 in g3doc/faq.md.
+ batch_size: 32
+ crop_size: 385
+ crop_size: 1249
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ augmentations {
+ min_scale_factor: 0.5
+ max_scale_factor: 2.0
+ scale_factor_step_size: 0.1
+ }
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+ use_two_frames: true
+}
+eval_dataset_options {
+ dataset: "kitti_step"
+ # Update the path to validation set.
+ file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 385 + crop_size: 1249 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: true + small_instance_weight: 3.0 + use_two_frames: true +} +evaluator_options { + continuous_eval_timeout: 21600 + stuff_area_limit: 0 + center_score_threshold: 0.1 + nms_kernel: 13 + save_predictions: true + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} diff --git a/configs/kitti/panoptic_deeplab/resnet50_os32.textproto b/configs/kitti/panoptic_deeplab/resnet50_os32.textproto new file mode 100644 index 0000000000000000000000000000000000000000..7fcf81a82bc09ce5deab4534349ac8156b0197ca --- /dev/null +++ b/configs/kitti/panoptic_deeplab/resnet50_os32.textproto @@ -0,0 +1,159 @@ +# proto-file: deeplab2/config.proto +# proto-message: ExperimentOptions +# +# Panoptic-DeepLab with ResNet-50 and output stride 32. +# +############### PLEASE READ THIS BEFORE USING THIS CONFIG ############### +# Before using this config, you need to update the following fields: +# - experiment_name: Use a unique experiment name for each experiment. +# - initial_checkpoint: Update the path to the initial checkpoint. +# - train_dataset_options.file_pattern: Update the path to the +# training set. e.g., your_dataset/train*.tfrecord +# - eval_dataset_options.file_pattern: Update the path to the +# validation set, e.g., your_dataset/eval*.tfrecord +# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you +# could successfully compile the provided efficient merging operation +# under the folder `tensorflow_ops`. +######################################################################### +# +# This config uses the Cityscapes pretrained checkpoint where crowd label is +# kept to pretrain the semantic segmentation branch. +# +# References: +# For ResNet, see +# - Kaiming He, et al. "Deep Residual Learning for Image Recognition." +# In CVPR, 2016. +# For Panoptic-DeepLab, see +# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline +# for Bottom-Up Panoptic Segmentation." In CVPR, 2020. + +# Use a unique experiment_name for each experiment. 
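The ${EXPERIMENT_NAME}-style fields in these configs are placeholders that must be filled in before use. Since the syntax happens to match Python's string.Template, one convenient way to materialize a config is sketched below; the helper and all paths are illustrative, not part of the repo:

```python
import string

def fill_config_template(template_path: str, output_path: str, **values) -> None:
  """Substitutes ${...} placeholders in a config template."""
  with open(template_path) as f:
    template = string.Template(f.read())
  with open(output_path, 'w') as f:
    f.write(template.substitute(values))

# Illustrative paths and values; adjust to your setup.
fill_config_template(
    'configs/kitti/panoptic_deeplab/resnet50_os32.textproto',
    '/tmp/kitti_pd_r50_os32.textproto',
    EXPERIMENT_NAME='kitti_pd_r50_os32',
    INIT_CHECKPOINT='/path/to/cityscapes_pretrained_checkpoint',
    TRAIN_SET='/data/kitti_step/train*.tfrecord',
    VAL_SET='/data/kitti_step/val*.tfrecord')
```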
+experiment_name: "${EXPERIMENT_NAME}"
+model_options {
+ # Update the path to the initial checkpoint (e.g., ImageNet
+ # pretrained checkpoint).
+ initial_checkpoint: "${INIT_CHECKPOINT}"
+ backbone {
+ name: "resnet50"
+ output_stride: 32
+ }
+ decoder {
+ feature_key: "res5"
+ decoder_channels: 256
+ aspp_channels: 256
+ atrous_rates: 3
+ atrous_rates: 6
+ atrous_rates: 9
+ }
+ panoptic_deeplab {
+ low_level {
+ feature_key: "res3"
+ channels_project: 64
+ }
+ low_level {
+ feature_key: "res2"
+ channels_project: 32
+ }
+ instance {
+ low_level_override {
+ feature_key: "res3"
+ channels_project: 32
+ }
+ low_level_override {
+ feature_key: "res2"
+ channels_project: 16
+ }
+ instance_decoder_override {
+ feature_key: "res5"
+ decoder_channels: 128
+ atrous_rates: 3
+ atrous_rates: 6
+ atrous_rates: 9
+ }
+ center_head {
+ output_channels: 1
+ head_channels: 32
+ }
+ regression_head {
+ output_channels: 2
+ head_channels: 32
+ }
+ }
+ semantic_head {
+ output_channels: 19
+ head_channels: 256
+ }
+ }
+}
+trainer_options {
+ save_checkpoints_steps: 1000
+ save_summaries_steps: 500
+ steps_per_loop: 500
+ loss_options {
+ semantic_loss {
+ name: "softmax_cross_entropy"
+ weight: 1.0
+ top_k_percent: 0.2
+ }
+ center_loss {
+ name: "mse"
+ weight: 200
+ }
+ regression_loss {
+ name: "l1"
+ weight: 0.01
+ }
+ }
+ solver_options {
+ base_learning_rate: 0.00001
+ training_number_of_steps: 30000
+ }
+}
+train_dataset_options {
+ dataset: "kitti_step"
+ # Update the path to training set.
+ file_pattern: "${TRAIN_SET}"
+ # Adjust the batch_size accordingly to better fit your GPU/TPU memory.
+ # Also see Q1 in g3doc/faq.md.
+ batch_size: 32
+ crop_size: 385
+ crop_size: 1249
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ augmentations {
+ min_scale_factor: 0.5
+ max_scale_factor: 2.0
+ scale_factor_step_size: 0.1
+ }
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+}
+eval_dataset_options {
+ dataset: "kitti_step"
+ # Update the path to validation set.
+ file_pattern: "${VAL_SET}"
+ batch_size: 1
+ crop_size: 385
+ crop_size: 1249
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ # Add options to make the evaluation loss comparable to the training loss.
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+}
+evaluator_options {
+ continuous_eval_timeout: 10000
+ stuff_area_limit: 0
+ center_score_threshold: 0.1
+ nms_kernel: 13
+ save_predictions: true
+ save_raw_predictions: false
+ convert_raw_to_eval_ids: false
+ # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and
+ # instance maps. For faster speed, compile TensorFlow with provided kernel
+ # implementation under the folder `tensorflow_ops`, and set
+ # merge_semantic_and_instance_with_tf_op to true.
+ merge_semantic_and_instance_with_tf_op: false
+}
diff --git a/configs/kitti/panoptic_deeplab/resnet50_os32_trainval.textproto b/configs/kitti/panoptic_deeplab/resnet50_os32_trainval.textproto
new file mode 100644
index 0000000000000000000000000000000000000000..549eea5064b8346ee96022270dd0133d9cc15351
--- /dev/null
+++ b/configs/kitti/panoptic_deeplab/resnet50_os32_trainval.textproto
@@ -0,0 +1,160 @@
+# proto-file: deeplab2/config.proto
+# proto-message: ExperimentOptions
+#
+# Panoptic-DeepLab with ResNet-50 and output stride 32.
+#
+############### PLEASE READ THIS BEFORE USING THIS CONFIG ###############
+# Before using this config, you need to update the following fields:
+# - experiment_name: Use a unique experiment name for each experiment.
+# - initial_checkpoint: Update the path to the initial checkpoint.
+# - train_dataset_options.file_pattern: Update the path to the
+# training set. e.g., your_dataset/train*.tfrecord
+# - eval_dataset_options.file_pattern: Update the path to the
+# validation set, e.g., your_dataset/eval*.tfrecord
+# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you
+# could successfully compile the provided efficient merging operation
+# under the folder `tensorflow_ops`.
+#########################################################################
+#
+# This config uses the Cityscapes pretrained checkpoint where crowd label is
+# kept to pretrain the semantic segmentation branch.
+#
+# References:
+# For ResNet, see
+# - Kaiming He, et al. "Deep Residual Learning for Image Recognition."
+# In CVPR, 2016.
+# For Panoptic-DeepLab, see
+# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline
+# for Bottom-Up Panoptic Segmentation." In CVPR, 2020.
+
+# Use a unique experiment_name for each experiment.
+experiment_name: "${EXPERIMENT_NAME}"
+model_options {
+ # Update the path to the initial checkpoint (e.g., ImageNet
+ # pretrained checkpoint).
+ initial_checkpoint: "${INIT_CHECKPOINT}"
+ backbone {
+ name: "resnet50"
+ output_stride: 32
+ }
+ decoder {
+ feature_key: "res5"
+ decoder_channels: 256
+ aspp_channels: 256
+ atrous_rates: 3
+ atrous_rates: 6
+ atrous_rates: 9
+ }
+ panoptic_deeplab {
+ low_level {
+ feature_key: "res3"
+ channels_project: 64
+ }
+ low_level {
+ feature_key: "res2"
+ channels_project: 32
+ }
+ instance {
+ low_level_override {
+ feature_key: "res3"
+ channels_project: 32
+ }
+ low_level_override {
+ feature_key: "res2"
+ channels_project: 16
+ }
+ instance_decoder_override {
+ feature_key: "res5"
+ decoder_channels: 128
+ atrous_rates: 3
+ atrous_rates: 6
+ atrous_rates: 9
+ }
+ center_head {
+ output_channels: 1
+ head_channels: 32
+ }
+ regression_head {
+ output_channels: 2
+ head_channels: 32
+ }
+ }
+ semantic_head {
+ output_channels: 19
+ head_channels: 256
+ }
+ }
+}
+trainer_options {
+ save_checkpoints_steps: 1000
+ save_summaries_steps: 500
+ steps_per_loop: 500
+ loss_options {
+ semantic_loss {
+ name: "softmax_cross_entropy"
+ weight: 1.0
+ top_k_percent: 0.2
+ }
+ center_loss {
+ name: "mse"
+ weight: 200
+ }
+ regression_loss {
+ name: "l1"
+ weight: 0.01
+ }
+ }
+ solver_options {
+ base_learning_rate: 0.000001
+ training_number_of_steps: 30000
+ }
+}
+train_dataset_options {
+ dataset: "kitti_step"
+ # Update the path to training set.
+ file_pattern: "${TRAIN_SET}"
+ file_pattern: "${VAL_SET}"
+ # Adjust the batch_size accordingly to better fit your GPU/TPU memory.
+ # Also see Q1 in g3doc/faq.md.
+ batch_size: 32
+ crop_size: 385
+ crop_size: 1249
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ augmentations {
+ min_scale_factor: 0.5
+ max_scale_factor: 2.0
+ scale_factor_step_size: 0.1
+ }
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+}
+eval_dataset_options {
+ dataset: "kitti_step"
+ # Update the path to validation set.
+ file_pattern: "${VAL_SET}"
+ batch_size: 1
+ crop_size: 385
+ crop_size: 1249
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ # Add options to make the evaluation loss comparable to the training loss.
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+}
+evaluator_options {
+ continuous_eval_timeout: 10000
+ stuff_area_limit: 0
+ center_score_threshold: 0.1
+ nms_kernel: 13
+ save_predictions: true
+ save_raw_predictions: false
+ convert_raw_to_eval_ids: false
+ # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and
+ # instance maps. For faster speed, compile TensorFlow with provided kernel
+ # implementation under the folder `tensorflow_ops`, and set
+ # merge_semantic_and_instance_with_tf_op to true.
+ merge_semantic_and_instance_with_tf_op: false
+}
diff --git a/configs/motchallenge/motion_deeplab/resnet50_os32.textproto b/configs/motchallenge/motion_deeplab/resnet50_os32.textproto
new file mode 100644
index 0000000000000000000000000000000000000000..3b6ca505b02723119620db23f17f2d73280906fa
--- /dev/null
+++ b/configs/motchallenge/motion_deeplab/resnet50_os32.textproto
@@ -0,0 +1,172 @@
+# proto-file: deeplab2/config.proto
+# proto-message: ExperimentOptions
+#
+# Motion-DeepLab with ResNet-50 and output stride 32.
+#
+############### PLEASE READ THIS BEFORE USING THIS CONFIG ###############
+# Before using this config, you need to update the following fields:
+# - experiment_name: Use a unique experiment name for each experiment.
+# - initial_checkpoint: Update the path to the initial checkpoint.
+# - train_dataset_options.file_pattern: Update the path to the
+# training set. e.g., your_dataset/train*.tfrecord
+# - eval_dataset_options.file_pattern: Update the path to the
+# validation set, e.g., your_dataset/eval*.tfrecord
+# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you
+# could successfully compile the provided efficient merging operation
+# under the folder `tensorflow_ops`.
+#########################################################################
+#
+# This config uses the Cityscapes pretrained checkpoint where crowd label is
+# kept to pretrain the semantic segmentation branch. Note that we additionally
+# perform net surgery on the first convolution and the last prediction layer
+# since (1) Motion-DeepLab takes two frames as input, and (2) MOTChallenge-STEP
+# contains a subset of the semantic classes of Cityscapes. For net surgery
+# details, see utils/net_surgery_convert_last_layer.py.
+#
+# References:
+# For ResNet, see
+# - Kaiming He, et al. "Deep Residual Learning for Image Recognition."
+# In CVPR, 2016.
+# For Motion-DeepLab, see
+# - Mark Weber, et al. "STEP: Segmenting and Tracking Every Pixel."
+# arXiv: 2102.11859.
+
+# Use a unique experiment_name for each experiment.
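The header above describes net surgery on the last prediction layer because MOTChallenge-STEP keeps only a subset of the Cityscapes classes. The repo's actual conversion lives in utils/net_surgery_convert_last_layer.py; the snippet below only illustrates the core idea, and the class mapping in it is hypothetical:

```python
import numpy as np

# Hypothetical mapping from the 7 retained classes to Cityscapes trainIds;
# the authoritative mapping is in utils/net_surgery_convert_last_layer.py.
RETAINED_CITYSCAPES_TRAIN_IDS = [2, 8, 10, 11, 12, 13, 17]

def subset_last_layer(kernel, bias, retained_ids):
  """Keeps only the output channels of the retained semantic classes."""
  return kernel[..., retained_ids], bias[retained_ids]

# A [h, w, in_channels, 19] Cityscapes head becomes a 7-class head.
kernel = np.zeros((1, 1, 256, 19), np.float32)
bias = np.zeros((19,), np.float32)
new_kernel, new_bias = subset_last_layer(kernel, bias,
                                         RETAINED_CITYSCAPES_TRAIN_IDS)
assert new_kernel.shape[-1] == 7 and new_bias.shape == (7,)
```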
+experiment_name: "${EXPERIMENT_NAME}"
+model_options {
+ # Update the path to the initial checkpoint (e.g., ImageNet
+ # pretrained checkpoint).
+ initial_checkpoint: "${INIT_CHECKPOINT}"
+ backbone {
+ name: "resnet50"
+ output_stride: 32
+ }
+ decoder {
+ feature_key: "res5"
+ decoder_channels: 256
+ aspp_channels: 256
+ atrous_rates: 3
+ atrous_rates: 6
+ atrous_rates: 9
+ }
+ motion_deeplab {
+ low_level {
+ feature_key: "res3"
+ channels_project: 64
+ }
+ low_level {
+ feature_key: "res2"
+ channels_project: 32
+ }
+ instance {
+ low_level_override {
+ feature_key: "res3"
+ channels_project: 32
+ }
+ low_level_override {
+ feature_key: "res2"
+ channels_project: 16
+ }
+ instance_decoder_override {
+ feature_key: "res5"
+ decoder_channels: 128
+ atrous_rates: 3
+ atrous_rates: 6
+ atrous_rates: 9
+ }
+ center_head {
+ output_channels: 1
+ head_channels: 32
+ }
+ regression_head {
+ output_channels: 2
+ head_channels: 32
+ }
+ }
+ semantic_head {
+ output_channels: 7
+ head_channels: 256
+ }
+ motion_head {
+ output_channels: 2
+ head_channels: 32
+ }
+ }
+}
+trainer_options {
+ save_checkpoints_steps: 100
+ save_summaries_steps: 50
+ steps_per_loop: 50
+ loss_options {
+ semantic_loss {
+ name: "softmax_cross_entropy"
+ weight: 1.0
+ top_k_percent: 0.2
+ }
+ center_loss {
+ name: "mse"
+ weight: 200
+ }
+ regression_loss {
+ name: "l1"
+ weight: 0.01
+ }
+ motion_loss {
+ name: "l1"
+ weight: 0.01
+ }
+ }
+ solver_options {
+ base_learning_rate: 0.00001
+ training_number_of_steps: 10000
+ }
+}
+train_dataset_options {
+ dataset: "motchallenge_step"
+ # Update the path to training set.
+ file_pattern: "${TRAIN_SET}"
+ # Adjust the batch_size accordingly to better fit your GPU/TPU memory.
+ # Also see Q1 in g3doc/faq.md.
+ batch_size: 32
+ crop_size: 1089
+ crop_size: 1921
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ augmentations {
+ min_scale_factor: 0.5
+ max_scale_factor: 2.0
+ scale_factor_step_size: 0.1
+ }
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+ use_two_frames: true
+}
+eval_dataset_options {
+ dataset: "motchallenge_step"
+ # Update the path to validation set.
+ file_pattern: "${VAL_SET}"
+ batch_size: 1
+ crop_size: 1089
+ crop_size: 1921
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ # Add options to make the evaluation loss comparable to the training loss.
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+ use_two_frames: true
+}
+evaluator_options {
+ continuous_eval_timeout: 10000
+ stuff_area_limit: 0
+ center_score_threshold: 0.1
+ nms_kernel: 13
+ save_predictions: true
+ save_raw_predictions: false
+ # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and
+ # instance maps. For faster speed, compile TensorFlow with provided kernel
+ # implementation under the folder `tensorflow_ops`, and set
+ # merge_semantic_and_instance_with_tf_op to true.
+ merge_semantic_and_instance_with_tf_op: false
+}
diff --git a/configs/motchallenge/panoptic_deeplab/resnet50_os32.textproto b/configs/motchallenge/panoptic_deeplab/resnet50_os32.textproto
new file mode 100644
index 0000000000000000000000000000000000000000..6d33cbcd210a9d25c298571b44d8ee30b3f7dcd2
--- /dev/null
+++ b/configs/motchallenge/panoptic_deeplab/resnet50_os32.textproto
@@ -0,0 +1,161 @@
+# proto-file: deeplab2/config.proto
+# proto-message: ExperimentOptions
+#
+# Panoptic-DeepLab with ResNet-50 and output stride 32.
+#
+############### PLEASE READ THIS BEFORE USING THIS CONFIG ###############
+# Before using this config, you need to update the following fields:
+# - experiment_name: Use a unique experiment name for each experiment.
+# - initial_checkpoint: Update the path to the initial checkpoint.
+# - train_dataset_options.file_pattern: Update the path to the
+# training set. e.g., your_dataset/train*.tfrecord
+# - eval_dataset_options.file_pattern: Update the path to the
+# validation set, e.g., your_dataset/eval*.tfrecord
+# - (optional) set merge_semantic_and_instance_with_tf_op: true, if you
+# could successfully compile the provided efficient merging operation
+# under the folder `tensorflow_ops`.
+#########################################################################
+#
+# This config uses the Cityscapes pretrained checkpoint where crowd label is
+# kept to pretrain the semantic segmentation branch. Note that we additionally
+# perform net surgery on the last prediction layer since MOTChallenge-STEP
+# contains a subset of the semantic classes of Cityscapes. For net surgery
+# details, see utils/net_surgery_convert_last_layer.py.
+#
+# References:
+# For ResNet, see
+# - Kaiming He, et al. "Deep Residual Learning for Image Recognition."
+# In CVPR, 2016.
+# For Panoptic-DeepLab, see
+# - Bowen Cheng, et al. "Panoptic-DeepLab: A Simple, Strong, and Fast Baseline
+# for Bottom-Up Panoptic Segmentation." In CVPR, 2020.
+
+# Use a unique experiment_name for each experiment.
+experiment_name: "${EXPERIMENT_NAME}"
+model_options {
+ # Update the path to the initial checkpoint (e.g., ImageNet
+ # pretrained checkpoint).
+ initial_checkpoint: "${INIT_CHECKPOINT}"
+ backbone {
+ name: "resnet50"
+ output_stride: 32
+ }
+ decoder {
+ feature_key: "res5"
+ decoder_channels: 256
+ aspp_channels: 256
+ atrous_rates: 3
+ atrous_rates: 6
+ atrous_rates: 9
+ }
+ panoptic_deeplab {
+ low_level {
+ feature_key: "res3"
+ channels_project: 64
+ }
+ low_level {
+ feature_key: "res2"
+ channels_project: 32
+ }
+ instance {
+ low_level_override {
+ feature_key: "res3"
+ channels_project: 32
+ }
+ low_level_override {
+ feature_key: "res2"
+ channels_project: 16
+ }
+ instance_decoder_override {
+ feature_key: "res5"
+ decoder_channels: 128
+ atrous_rates: 3
+ atrous_rates: 6
+ atrous_rates: 9
+ }
+ center_head {
+ output_channels: 1
+ head_channels: 32
+ }
+ regression_head {
+ output_channels: 2
+ head_channels: 32
+ }
+ }
+ semantic_head {
+ output_channels: 7
+ head_channels: 256
+ }
+ }
+}
+trainer_options {
+ save_checkpoints_steps: 200
+ save_summaries_steps: 50
+ steps_per_loop: 50
+ loss_options {
+ semantic_loss {
+ name: "softmax_cross_entropy"
+ weight: 1.0
+ top_k_percent: 0.2
+ }
+ center_loss {
+ name: "mse"
+ weight: 200
+ }
+ regression_loss {
+ name: "l1"
+ weight: 0.01
+ }
+ }
+ solver_options {
+ base_learning_rate: 0.00001
+ training_number_of_steps: 10000
+ }
+}
+train_dataset_options {
+ dataset: "motchallenge_step"
+ # Update the path to training set.
+ file_pattern: "${TRAIN_SET}"
+ # Adjust the batch_size accordingly to better fit your GPU/TPU memory.
+ # Also see Q1 in g3doc/faq.md.
+ batch_size: 32
+ crop_size: 1089
+ crop_size: 1921
+ # Skip resizing.
+ min_resize_value: 0
+ max_resize_value: 0
+ augmentations {
+ min_scale_factor: 0.5
+ max_scale_factor: 2.0
+ scale_factor_step_size: 0.1
+ }
+ increase_small_instance_weights: true
+ small_instance_weight: 3.0
+}
+eval_dataset_options {
+ dataset: "motchallenge_step"
+ # Update the path to validation set.
+ file_pattern: "${VAL_SET}" + batch_size: 1 + crop_size: 1089 + crop_size: 1921 + # Skip resizing. + min_resize_value: 0 + max_resize_value: 0 + # Add options to make the evaluation loss comparable to the training loss. + increase_small_instance_weights: true + small_instance_weight: 3.0 +} +evaluator_options { + continuous_eval_timeout: 10000 + stuff_area_limit: 0 + center_score_threshold: 0.1 + nms_kernel: 13 + save_predictions: true + save_raw_predictions: false + # Use pure tf functions (i.e., no CUDA kernel) to merge semantic and + # instance maps. For faster speed, compile TensorFlow with provided kernel + # implementation under the folder `tensorflow_ops`, and set + # merge_semantic_and_instance_with_tf_op to true. + merge_semantic_and_instance_with_tf_op: false +} diff --git a/data/__init__.py b/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..35e4ce02ff422f3aa84ab644b88d65b13e0cbc03 --- /dev/null +++ b/data/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/data/build_cityscapes_data.py b/data/build_cityscapes_data.py new file mode 100644 index 0000000000000000000000000000000000000000..a3001d3a5e52619b7e68f77e48d25440763e883f --- /dev/null +++ b/data/build_cityscapes_data.py @@ -0,0 +1,321 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Converts Cityscapes data to sharded TFRecord file format with Example protos. + +Please check ../g3doc/setup/cityscapes.md for instructions. 
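+
+Example usage (flag values are illustrative):
+
+  python deeplab2/data/build_cityscapes_data.py \
+    --cityscapes_root=${CITYSCAPES_ROOT} \
+    --output_dir=${OUTPUT_DIR}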
+""" + +import collections +import json +import math +import os + +from absl import app +from absl import flags +from absl import logging +import numpy as np +import tensorflow as tf + +from deeplab2.data import data_utils +from deeplab2.data import dataset + +FLAGS = flags.FLAGS + +flags.DEFINE_string('cityscapes_root', None, 'Cityscapes dataset root folder.') + +flags.DEFINE_string('output_dir', None, + 'Path to save converted TFRecord of TensorFlow examples.') + +flags.DEFINE_boolean('create_panoptic_data', True, + 'Whether to create semantic or panoptic dataset.') + +flags.DEFINE_boolean('treat_crowd_as_ignore', True, + 'Whether to apply ignore labels to crowd pixels in ' + 'panoptic label.') + +_NUM_SHARDS = 10 +_SPLITS_TO_SIZES = dataset.CITYSCAPES_INFORMATION.splits_to_sizes +_IGNORE_LABEL = dataset.CITYSCAPES_PANOPTIC_INFORMATION.ignore_label +_CLASS_HAS_INSTANCE_LIST = dataset.CITYSCAPES_PANOPTIC_INFORMATION.class_has_instances_list +_PANOPTIC_LABEL_DIVISOR = dataset.CITYSCAPES_PANOPTIC_INFORMATION.panoptic_label_divisor + +# A map from data type to folder name that saves the data. +_FOLDERS_MAP = { + 'image': 'leftImg8bit', + 'label': 'gtFine', +} + +# A map from data type to filename postfix. +_POSTFIX_MAP = { + 'image': '_leftImg8bit', + 'label': '_gtFine_labelTrainIds', +} + +# A map from data type to data format. +_DATA_FORMAT_MAP = { + 'image': 'png', + 'label': 'png', +} +_PANOPTIC_LABEL_FORMAT = 'raw' + + +def _get_images(cityscapes_root, dataset_split): + """Gets files for the specified data type and dataset split. + + Args: + cityscapes_root: String, path to Cityscapes dataset root folder. + dataset_split: String, dataset split ('train', 'val', 'test') + + Returns: + A list of sorted file names or None when getting label for + test set. + """ + pattern = '*%s.%s' % (_POSTFIX_MAP['image'], _DATA_FORMAT_MAP['image']) + search_files = os.path.join( + cityscapes_root, _FOLDERS_MAP['image'], dataset_split, '*', pattern) + filenames = tf.io.gfile.glob(search_files) + return sorted(filenames) + + +def _split_image_path(image_path): + """Helper method to extract split paths from input image path. + + Args: + image_path: String, path to the image file. + + Returns: + A tuple of (cityscape root, dataset split, cityname and shared filename + prefix). + """ + image_path = os.path.normpath(image_path) + path_list = image_path.split(os.sep) + image_folder, dataset_split, city_name, file_name = path_list[-4:] + if image_folder != _FOLDERS_MAP['image']: + raise ValueError('Expects image path %s containing image folder.' 
+ % image_path) + + pattern = '%s.%s' % (_POSTFIX_MAP['image'], _DATA_FORMAT_MAP['image']) + if not file_name.endswith(pattern): + raise ValueError('Image file name %s should end with %s' % + (file_name, pattern)) + + file_prefix = file_name[:-len(pattern)] + return os.sep.join(path_list[:-4]), dataset_split, city_name, file_prefix + + +def _get_semantic_annotation(image_path): + cityscapes_root, dataset_split, city_name, file_prefix = _split_image_path( + image_path) + semantic_annotation = '%s%s.%s' % (file_prefix, _POSTFIX_MAP['label'], + _DATA_FORMAT_MAP['label']) + return os.path.join(cityscapes_root, _FOLDERS_MAP['label'], dataset_split, + city_name, semantic_annotation) + + +def _get_panoptic_annotation(cityscapes_root, dataset_split, + annotation_file_name): + panoptic_folder = 'cityscapes_panoptic_%s_trainId' % dataset_split + return os.path.join(cityscapes_root, _FOLDERS_MAP['label'], panoptic_folder, + annotation_file_name) + + +def _read_segments(cityscapes_root, dataset_split): + """Reads segments information from json file. + + Args: + cityscapes_root: String, path to Cityscapes dataset root folder. + dataset_split: String, dataset split. + + Returns: + segments_dict: A dictionary that maps `image_id` (common file prefix) to + a tuple of (panoptic annotation file name, segments). Please refer to + _generate_panoptic_label() method on the detail structure of `segments`. + """ + json_filename = os.path.join( + cityscapes_root, _FOLDERS_MAP['label'], + 'cityscapes_panoptic_%s_trainId.json' % dataset_split) + with tf.io.gfile.GFile(json_filename) as f: + panoptic_dataset = json.load(f) + + segments_dict = {} + for annotation in panoptic_dataset['annotations']: + image_id = annotation['image_id'] + if image_id in segments_dict: + raise ValueError('Image ID %s already exists' % image_id) + annotation_file_name = annotation['file_name'] + segments = annotation['segments_info'] + + segments_dict[image_id] = (annotation_file_name, segments) + return segments_dict + + +def _generate_panoptic_label(panoptic_annotation_file, segments): + """Creates panoptic label map from annotations. + + Args: + panoptic_annotation_file: String, path to panoptic annotation (populated + with `trainId`). + segments: A list of dictionaries containing information of every segment. + Read from panoptic_${DATASET_SPLIT}_trainId.json. This method consumes + the following fields in each dictionary: + - id: panoptic id + - category_id: semantic class id + - area: pixel area of this segment + - iscrowd: if this segment is crowd region + + Returns: + A 2D numpy int32 array with the same height / width with panoptic + annotation. Each pixel value represents its panoptic ID. Please refer to + ../g3doc/setup/cityscapes.md for more details about how panoptic ID is + assigned. + """ + with tf.io.gfile.GFile(panoptic_annotation_file, 'rb') as f: + panoptic_label = data_utils.read_image(f.read()) + + if panoptic_label.mode != 'RGB': + raise ValueError('Expect RGB image for panoptic label, gets %s' % + panoptic_label.mode) + + panoptic_label = np.array(panoptic_label, dtype=np.int32) + # Cityscapes panoptic map is created by: + # color = [segmentId % 256, segmentId // 256, segmentId // 256 // 256] + panoptic_label = np.dot(panoptic_label, [1, 256, 256 * 256]) + + semantic_label = np.ones_like(panoptic_label) * _IGNORE_LABEL + instance_label = np.zeros_like(panoptic_label) + # Running count of instances per semantic category. 
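+ # For intuition: with a panoptic_label_divisor of 1000 (the Cityscapes
+ # setting), the second 'car' (trainId 13) instance in an image is encoded
+ # below as 13 * 1000 + 2 = 13002, while a crowd region of cars (when not
+ # treated as ignore) keeps instance ID 0 and encodes as 13 * 1000 = 13000.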
+ instance_count = collections.defaultdict(int) + for segment in segments: + selected_pixels = panoptic_label == segment['id'] + pixel_area = np.sum(selected_pixels) + if pixel_area != segment['area']: + raise ValueError('Expect %d pixels for segment %s, gets %d.' % + (segment['area'], segment, pixel_area)) + + category_id = segment['category_id'] + semantic_label[selected_pixels] = category_id + + if category_id in _CLASS_HAS_INSTANCE_LIST: + if segment['iscrowd']: + # Cityscapes crowd pixels will have instance ID of 0. + if FLAGS.treat_crowd_as_ignore: + semantic_label[selected_pixels] = _IGNORE_LABEL + continue + # Non-crowd pixels will have instance ID starting from 1. + instance_count[category_id] += 1 + if instance_count[category_id] >= _PANOPTIC_LABEL_DIVISOR: + raise ValueError('Too many instances for category %d in this image.' % + category_id) + instance_label[selected_pixels] = instance_count[category_id] + elif segment['iscrowd']: + raise ValueError('Stuff class should not have `iscrowd` label.') + + panoptic_label = semantic_label * _PANOPTIC_LABEL_DIVISOR + instance_label + return panoptic_label.astype(np.int32) + + +def _convert_split_name(dataset_split): + return dataset_split + '_fine' + + +def _create_semantic_label(image_path): + """Creates labels for semantic segmentation.""" + with tf.io.gfile.GFile(_get_semantic_annotation(image_path), 'rb') as f: + label_data = f.read() + + return label_data, _DATA_FORMAT_MAP['label'] + + +def _create_panoptic_label(image_path, segments_dict): + """Creates labels for panoptic segmentation.""" + cityscapes_root, dataset_split, _, file_prefix = _split_image_path(image_path) + + annotation_file_name, segments = segments_dict[file_prefix] + panoptic_annotation_file = _get_panoptic_annotation(cityscapes_root, + dataset_split, + annotation_file_name) + + panoptic_label = _generate_panoptic_label(panoptic_annotation_file, segments) + return panoptic_label.tostring(), _PANOPTIC_LABEL_FORMAT + + +def _convert_dataset(cityscapes_root, dataset_split, output_dir): + """Converts the specified dataset split to TFRecord format. + + Args: + cityscapes_root: String, path to Cityscapes dataset root folder. + dataset_split: String, the dataset split (one of `train`, `val` and `test`). + output_dir: String, directory to write output TFRecords to. + + Raises: + RuntimeError: If loaded image and label have different shape, or if the + image file with specified postfix could not be found. + """ + image_files = _get_images(cityscapes_root, dataset_split) + + num_images = len(image_files) + expected_dataset_size = _SPLITS_TO_SIZES[_convert_split_name(dataset_split)] + if num_images != expected_dataset_size: + raise ValueError('Expects %d images, gets %d' % + (expected_dataset_size, num_images)) + + segments_dict = None + if FLAGS.create_panoptic_data: + segments_dict = _read_segments(FLAGS.cityscapes_root, dataset_split) + + num_per_shard = int(math.ceil(len(image_files) / _NUM_SHARDS)) + + for shard_id in range(_NUM_SHARDS): + shard_filename = '%s-%05d-of-%05d.tfrecord' % ( + dataset_split, shard_id, _NUM_SHARDS) + output_filename = os.path.join(output_dir, shard_filename) + with tf.io.TFRecordWriter(output_filename) as tfrecord_writer: + start_idx = shard_id * num_per_shard + end_idx = min((shard_id + 1) * num_per_shard, num_images) + for i in range(start_idx, end_idx): + # Read the image. 
+ with tf.io.gfile.GFile(image_files[i], 'rb') as f:
+ image_data = f.read()
+
+ if dataset_split == 'test':
+ label_data, label_format = None, None
+ elif FLAGS.create_panoptic_data:
+ label_data, label_format = _create_panoptic_label(
+ image_files[i], segments_dict)
+ else:
+ label_data, label_format = _create_semantic_label(image_files[i])
+
+ # Convert to tf example.
+ _, _, _, file_prefix = _split_image_path(image_files[i])
+ example = data_utils.create_tfexample(image_data,
+ _DATA_FORMAT_MAP['image'],
+ file_prefix, label_data,
+ label_format)
+
+ tfrecord_writer.write(example.SerializeToString())
+
+
+def main(unused_argv):
+ tf.io.gfile.makedirs(FLAGS.output_dir)
+
+ for dataset_split in ('train', 'val', 'test'):
+ logging.info('Starts processing dataset split %s.', dataset_split)
+ _convert_dataset(FLAGS.cityscapes_root, dataset_split, FLAGS.output_dir)
+
+
+if __name__ == '__main__':
+ flags.mark_flags_as_required(['cityscapes_root', 'output_dir'])
+ app.run(main)
diff --git a/data/build_cityscapes_data_test.py b/data/build_cityscapes_data_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9707f89366f79753f93bfdd6e217f20456ffb978
--- /dev/null
+++ b/data/build_cityscapes_data_test.py
@@ -0,0 +1,67 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for build_cityscapes_data."""
+
+import os
+
+from absl import flags
+import numpy as np
+from PIL import Image
+import tensorflow as tf
+
+from deeplab2.data import build_cityscapes_data
+
+
+FLAGS = flags.FLAGS
+_TEST_DATA_DIR = 'deeplab2/data/testdata'
+_TEST_FILE_PREFIX = 'dummy_000000_000000'
+
+
+class BuildCityscapesDataTest(tf.test.TestCase):
+
+ def test_read_segments(self):
+ cityscapes_root = os.path.join(_TEST_DATA_DIR)
+ segments_dict = build_cityscapes_data._read_segments(
+ cityscapes_root, dataset_split='dummy')
+ self.assertIn(_TEST_FILE_PREFIX, segments_dict)
+ _, segments = segments_dict[_TEST_FILE_PREFIX]
+ self.assertLen(segments, 10)
+
+ def test_generate_panoptic_label(self):
+ FLAGS.treat_crowd_as_ignore = False # Test a more complicated setting.
+ cityscapes_root = os.path.join(_TEST_DATA_DIR)
+ segments_dict = build_cityscapes_data._read_segments(
+ cityscapes_root, dataset_split='dummy')
+ annotation_file_name, segments = segments_dict[_TEST_FILE_PREFIX]
+ panoptic_annotation_file = build_cityscapes_data._get_panoptic_annotation(
+ cityscapes_root, dataset_split='dummy',
+ annotation_file_name=annotation_file_name)
+ panoptic_label = build_cityscapes_data._generate_panoptic_label(
+ panoptic_annotation_file, segments)
+
+ # Check panoptic label matches golden file.
+ golden_file_path = os.path.join(_TEST_DATA_DIR, + 'dummy_gt_for_vps.png') + with tf.io.gfile.GFile(golden_file_path, 'rb') as f: + golden_label = Image.open(f) + # The PNG file is encoded by: + # color = [segmentId % 256, segmentId // 256, segmentId // 256 // 256] + golden_label = np.dot(np.asarray(golden_label), [1, 256, 256 * 256]) + + np.testing.assert_array_equal(panoptic_label, golden_label) + +if __name__ == '__main__': + tf.test.main() diff --git a/data/build_coco_data.py b/data/build_coco_data.py new file mode 100644 index 0000000000000000000000000000000000000000..6ae9176e78608f12e86000686baac209f903053e --- /dev/null +++ b/data/build_coco_data.py @@ -0,0 +1,309 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Converts COCO data to sharded TFRecord file format with Example protos. + +Please check + ../g3doc/setup/coco.md +for instructions. +""" + +import collections +import json +import math +import os + +from typing import Sequence, Tuple, Any + +from absl import app +from absl import flags +from absl import logging +import numpy as np +import tensorflow as tf + +from deeplab2.data import coco_constants +from deeplab2.data import data_utils +from deeplab2.data import dataset + +FLAGS = flags.FLAGS + +flags.DEFINE_string('coco_root', None, 'coco dataset root folder.') + +flags.DEFINE_string('output_dir', None, + 'Path to save converted TFRecord of TensorFlow examples.') + +flags.DEFINE_boolean('treat_crowd_as_ignore', True, + 'Whether to apply ignore labels to crowd pixels in ' + 'panoptic label.') + +_NUM_SHARDS = 1000 + + +_SPLITS_TO_SIZES = dataset.COCO_PANOPTIC_INFORMATION.splits_to_sizes +_IGNORE_LABEL = dataset.COCO_PANOPTIC_INFORMATION.ignore_label +_CLASS_HAS_INSTANCE_LIST = dataset.COCO_PANOPTIC_INFORMATION.class_has_instances_list +_PANOPTIC_LABEL_DIVISOR = dataset.COCO_PANOPTIC_INFORMATION.panoptic_label_divisor +_CLASS_MAPPING = coco_constants.get_id_mapping() + +# A map from data type to folder name that saves the data. +_FOLDERS_MAP = { + 'train': { + 'image': 'train2017', + 'label': 'annotations', + }, + 'val': { + 'image': 'val2017', + 'label': 'annotations', + }, + 'test': { + 'image': 'test2017', + 'label': '', + } +} + +# A map from data type to data format. +_DATA_FORMAT_MAP = { + 'image': 'jpg', + 'label': 'png', +} +_PANOPTIC_LABEL_FORMAT = 'raw' + + +def _get_images(coco_root: str, dataset_split: str) -> Sequence[str]: + """Gets files for the specified data type and dataset split. + + Args: + coco_root: String, path to coco dataset root folder. + dataset_split: String, dataset split ('train', 'val', 'test'). + + Returns: + A list of sorted file names. 
+ """ + pattern = '*.%s' % _DATA_FORMAT_MAP['image'] + search_files = os.path.join( + coco_root, _FOLDERS_MAP[dataset_split]['image'], pattern) + filenames = tf.io.gfile.glob(search_files) + return sorted(filenames) + + +def _get_panoptic_annotation(coco_root: str, dataset_split: str, + annotation_file_name: str) -> str: + panoptic_folder = 'panoptic_%s2017' % dataset_split + return os.path.join(coco_root, _FOLDERS_MAP[dataset_split]['label'], + panoptic_folder, annotation_file_name) + + +def _read_segments(coco_root: str, dataset_split: str): + """Reads segments information from json file. + + Args: + coco_root: String, path to coco dataset root folder. + dataset_split: String, dataset split. + + Returns: + segments_dict: A dictionary that maps file prefix of annotation_file_name to + a tuple of (panoptic annotation file name, segments). Please refer to + _generate_panoptic_label() method on the detail structure of `segments`. + + Raises: + ValueError: If found duplicated image id in annotations. + """ + json_filename = os.path.join( + coco_root, _FOLDERS_MAP[dataset_split]['label'], + 'panoptic_%s2017.json' % dataset_split) + with tf.io.gfile.GFile(json_filename) as f: + panoptic_dataset = json.load(f) + + segments_dict = {} + for annotation in panoptic_dataset['annotations']: + image_id = annotation['image_id'] + if image_id in segments_dict: + raise ValueError('Image ID %s already exists' % image_id) + annotation_file_name = annotation['file_name'] + segments = annotation['segments_info'] + + segments_dict[os.path.splitext(annotation_file_name)[-2]] = ( + annotation_file_name, segments) + + return segments_dict + + +def _generate_panoptic_label(panoptic_annotation_file: str, segments: + Any) -> np.ndarray: + """Creates panoptic label map from annotations. + + Args: + panoptic_annotation_file: String, path to panoptic annotation. + segments: A list of dictionaries containing information of every segment. + Read from panoptic_${DATASET_SPLIT}2017.json. This method consumes + the following fields in each dictionary: + - id: panoptic id + - category_id: semantic class id + - area: pixel area of this segment + - iscrowd: if this segment is crowd region + + Returns: + A 2D numpy int32 array with the same height / width with panoptic + annotation. Each pixel value represents its panoptic ID. Please refer to + g3doc/setup/coco.md for more details about how panoptic ID is assigned. + """ + with tf.io.gfile.GFile(panoptic_annotation_file, 'rb') as f: + panoptic_label = data_utils.read_image(f.read()) + + if panoptic_label.mode != 'RGB': + raise ValueError('Expect RGB image for panoptic label, gets %s' % + panoptic_label.mode) + + panoptic_label = np.array(panoptic_label, dtype=np.int32) + # COCO panoptic map is created by: + # color = [segmentId % 256, segmentId // 256, segmentId // 256 // 256] + panoptic_label = np.dot(panoptic_label, [1, 256, 256 * 256]) + + semantic_label = np.ones_like(panoptic_label) * _IGNORE_LABEL + instance_label = np.zeros_like(panoptic_label) + # Running count of instances per semantic category. + instance_count = collections.defaultdict(int) + + for segment in segments: + selected_pixels = panoptic_label == segment['id'] + pixel_area = np.sum(selected_pixels) + if pixel_area != segment['area']: + raise ValueError('Expect %d pixels for segment %s, gets %d.' 
% + (segment['area'], segment, pixel_area)) + + category_id = segment['category_id'] + + # Map the category_id to contiguous ids + category_id = _CLASS_MAPPING[category_id] + + semantic_label[selected_pixels] = category_id + + if category_id in _CLASS_HAS_INSTANCE_LIST: + if segment['iscrowd']: + # COCO crowd pixels will have instance ID of 0. + if FLAGS.treat_crowd_as_ignore: + semantic_label[selected_pixels] = _IGNORE_LABEL + continue + # Non-crowd pixels will have instance ID starting from 1. + instance_count[category_id] += 1 + if instance_count[category_id] >= _PANOPTIC_LABEL_DIVISOR: + raise ValueError('Too many instances for category %d in this image.' % + category_id) + instance_label[selected_pixels] = instance_count[category_id] + elif segment['iscrowd']: + raise ValueError('Stuff class should not have `iscrowd` label.') + + panoptic_label = semantic_label * _PANOPTIC_LABEL_DIVISOR + instance_label + return panoptic_label.astype(np.int32) + + +def _create_panoptic_label(coco_root: str, dataset_split: str, image_path: str, + segments_dict: Any + ) -> Tuple[str, str]: + """Creates labels for panoptic segmentation. + + Args: + coco_root: String, path to coco dataset root folder. + dataset_split: String, dataset split ('train', 'val', 'test'). + image_path: String, path to the image file. + segments_dict: + Read from panoptic_${DATASET_SPLIT}2017.json. This method consumes + the following fields in each dictionary: + - id: panoptic id + - category_id: semantic class id + - area: pixel area of this segment + - iscrowd: if this segment is crowd region + + Returns: + A panoptic label where each pixel value represents its panoptic ID. + Please refer to g3doc/setup/coco.md for more details about howpanoptic ID + is assigned. + A string indicating label format in TFRecord. + """ + + image_path = os.path.normpath(image_path) + path_list = image_path.split(os.sep) + file_name = path_list[-1] + + annotation_file_name, segments = segments_dict[ + os.path.splitext(file_name)[-2]] + panoptic_annotation_file = _get_panoptic_annotation(coco_root, + dataset_split, + annotation_file_name) + + panoptic_label = _generate_panoptic_label(panoptic_annotation_file, segments) + return panoptic_label.tostring(), _PANOPTIC_LABEL_FORMAT + + +def _convert_dataset(coco_root: str, dataset_split: str, + output_dir: str) -> None: + """Converts the specified dataset split to TFRecord format. + + Args: + coco_root: String, path to coco dataset root folder. + dataset_split: String, the dataset split (one of `train`, `val` and `test`). + output_dir: String, directory to write output TFRecords to. + """ + image_files = _get_images(coco_root, dataset_split) + + num_images = len(image_files) + + if dataset_split != 'test': + segments_dict = _read_segments(coco_root, dataset_split) + + num_per_shard = int(math.ceil(len(image_files) / _NUM_SHARDS)) + + for shard_id in range(_NUM_SHARDS): + shard_filename = '%s-%05d-of-%05d.tfrecord' % ( + dataset_split, shard_id, _NUM_SHARDS) + output_filename = os.path.join(output_dir, shard_filename) + with tf.io.TFRecordWriter(output_filename) as tfrecord_writer: + start_idx = shard_id * num_per_shard + end_idx = min((shard_id + 1) * num_per_shard, num_images) + for i in range(start_idx, end_idx): + # Read the image. 
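
A generated map can be sanity-checked by inverting the packing above. A minimal sketch; the divisor value 256 here is an assumption for illustration only (the script takes the real value from dataset.COCO_PANOPTIC_INFORMATION):

    import numpy as np

    def split_panoptic(panoptic_label, divisor):
      # Inverse of: panoptic = semantic * divisor + instance.
      return panoptic_label // divisor, panoptic_label % divisor

    panoptic = np.array([17 * 256 + 2, 133 * 256], dtype=np.int32)
    semantic, instance = split_panoptic(panoptic, 256)
    assert semantic.tolist() == [17, 133]
    assert instance.tolist() == [2, 0]

+        # Read the image.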
+ with tf.io.gfile.GFile(image_files[i], 'rb') as f: + image_data = f.read() + + if dataset_split == 'test': + label_data, label_format = None, None + else: + label_data, label_format = _create_panoptic_label( + coco_root, dataset_split, image_files[i], segments_dict) + + # Convert to tf example. + image_path = os.path.normpath(image_files[i]) + path_list = image_path.split(os.sep) + file_name = path_list[-1] + file_prefix = file_name.replace(_DATA_FORMAT_MAP['image'], '') + example = data_utils.create_tfexample(image_data, + 'jpeg', + file_prefix, label_data, + label_format) + + tfrecord_writer.write(example.SerializeToString()) + + +def main(unused_argv: Sequence[str]) -> None: + tf.io.gfile.makedirs(FLAGS.output_dir) + + for dataset_split in ('train', 'val', 'test'): + logging.info('Starts processing dataset split %s.', dataset_split) + _convert_dataset(FLAGS.coco_root, dataset_split, FLAGS.output_dir) + + +if __name__ == '__main__': + flags.mark_flags_as_required(['coco_root', 'output_dir']) + app.run(main) diff --git a/data/build_coco_data_test.py b/data/build_coco_data_test.py new file mode 100644 index 0000000000000000000000000000000000000000..63f835ec7cac5b7c087f86548f0766f5b0c677a3 --- /dev/null +++ b/data/build_coco_data_test.py @@ -0,0 +1,174 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for build_coco_data.""" + +import json +import os + +from absl import flags +import numpy as np +from PIL import Image +import tensorflow as tf + +from deeplab2.data import build_coco_data +from deeplab2.data import coco_constants + +FLAGS = flags.FLAGS +_TEST_FILE_NAME = '000000123456.png' + + +class BuildCOCODataTest(tf.test.TestCase): + + def setUp(self): + super().setUp() + self.data_dir = FLAGS.test_tmpdir + self.height = 100 + self.width = 100 + self.split = 'train' + image_path = os.path.join(self.data_dir, + build_coco_data._FOLDERS_MAP[self.split]['image']) + panoptic_map_path = os.path.join(self.data_dir, + build_coco_data._FOLDERS_MAP + [self.split]['label']) + tf.io.gfile.makedirs(panoptic_map_path) + panoptic_map_path = os.path.join(panoptic_map_path, + 'panoptic_%s2017' % self.split) + + tf.io.gfile.makedirs(image_path) + tf.io.gfile.makedirs(panoptic_map_path) + self.panoptic_maps = {} + image_id = int(_TEST_FILE_NAME[:-4]) + self.panoptic_maps[image_id] = self._create_image_and_panoptic_map( + image_path, panoptic_map_path, image_id) + + def _create_image_and_panoptic_map(self, image_path, panoptic_path, image_id): + def id2rgb(id_map): + id_map_copy = id_map.copy() + rgb_shape = tuple(list(id_map.shape) + [3]) + rgb_map = np.zeros(rgb_shape, dtype=np.uint8) + for i in range(3): + rgb_map[..., i] = id_map_copy % 256 + id_map_copy //= 256 + return rgb_map + + # Creates dummy images and panoptic maps. + # Dummy image. 
+ image = np.random.randint( + 0, 255, (self.height, self.width, 3), dtype=np.uint8) + with tf.io.gfile.GFile( + os.path.join(image_path, '%012d.jpg' % image_id), 'wb') as f: + Image.fromarray(image).save(f, format='JPEG') + + # Dummy panoptic map. + semantic = np.random.randint( + 0, 201, (self.height, self.width), dtype=np.int32) + instance_ = np.random.randint( + 0, 100, (self.height, self.width), dtype=np.int32) + id_mapping = coco_constants.get_id_mapping() + valid_semantic = id_mapping.keys() + for i in range(201): + if i not in valid_semantic: + mask = (semantic == i) + semantic[mask] = 0 + instance_[mask] = 0 + + instance = instance_.copy() + segments_info = [] + for sem in np.unique(semantic): + ins_id = 1 + if sem == 0: + continue + if id_mapping[sem] in build_coco_data._CLASS_HAS_INSTANCE_LIST: + for ins in np.unique(instance_[semantic == sem]): + instance[np.logical_and(semantic == sem, instance_ == ins)] = ins_id + area = np.logical_and(semantic == sem, instance_ == ins).sum() + idx = sem * 256 + ins_id + iscrowd = 0 + segments_info.append({ + 'id': idx.tolist(), + 'category_id': sem.tolist(), + 'area': area.tolist(), + 'iscrowd': iscrowd, + }) + ins_id += 1 + else: + instance[semantic == sem] = 0 + area = (semantic == sem).sum() + idx = sem * 256 + iscrowd = 0 + segments_info.append({ + 'id': idx.tolist(), + 'category_id': sem.tolist(), + 'area': area.tolist(), + 'iscrowd': iscrowd, + }) + + encoded_panoptic_map = semantic * 256 + instance + encoded_panoptic_map = id2rgb(encoded_panoptic_map) + with tf.io.gfile.GFile( + os.path.join(panoptic_path, '%012d.png' % image_id), 'wb') as f: + Image.fromarray(encoded_panoptic_map).save(f, format='PNG') + + for i in range(201): + if i in valid_semantic: + mask = (semantic == i) + semantic[mask] = id_mapping[i] + + decoded_panoptic_map = semantic * 256 + instance + + # Write json file + json_annotation = { + 'annotations': [ + { + 'file_name': _TEST_FILE_NAME, + 'image_id': int(_TEST_FILE_NAME[:-4]), + 'segments_info': segments_info + } + ] + } + json_annotation_path = os.path.join(self.data_dir, + build_coco_data._FOLDERS_MAP + [self.split]['label'], + 'panoptic_%s2017.json' % self.split) + with tf.io.gfile.GFile(json_annotation_path, 'w') as f: + json.dump(json_annotation, f, indent=2) + + return decoded_panoptic_map + + def test_build_coco_dataset_correct(self): + build_coco_data._convert_dataset( + coco_root=self.data_dir, + dataset_split=self.split, + output_dir=FLAGS.test_tmpdir) + output_record = os.path.join( + FLAGS.test_tmpdir, '%s-%05d-of-%05d.tfrecord' % + (self.split, 0, build_coco_data._NUM_SHARDS)) + self.assertTrue(tf.io.gfile.exists(output_record)) + + # Parses tf record. + image_ids = sorted(self.panoptic_maps) + for i, raw_record in enumerate( + tf.data.TFRecordDataset([output_record]).take(5)): + image_id = image_ids[i] + example = tf.train.Example.FromString(raw_record.numpy()) + panoptic_map = np.fromstring( + example.features.feature['image/segmentation/class/encoded'] + .bytes_list.value[0], + dtype=np.int32).reshape((self.height, self.width)) + np.testing.assert_array_equal(panoptic_map, self.panoptic_maps[image_id]) + +if __name__ == '__main__': + tf.test.main() diff --git a/data/build_dvps_data.py b/data/build_dvps_data.py new file mode 100644 index 0000000000000000000000000000000000000000..7057aae62cb23d8571e7c65f5bb3bf789a02b2f2 --- /dev/null +++ b/data/build_dvps_data.py @@ -0,0 +1,264 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""Converts Depth-aware Video Panoptic Segmentation (DVPS) data to sharded TFRecord file format with tf.train.Example protos.
+
+The expected directory structure of the DVPS dataset should be as follows:
+
+  + DVPS_ROOT
+    + train | val
+      - ground-truth depth maps (*_depth.png)
+      - ground-truth panoptic maps (*_gtFine_instanceTrainIds.png)
+      - images (*_leftImg8bit.png)
+    + test
+      - images (*_leftImg8bit.png)
+
+The ground-truth panoptic map is encoded as follows in PNG format:
+
+  panoptic ID = semantic ID * panoptic divisor (1000) + instance ID
+
+
+The output Example proto contains the following fields:
+
+  image/encoded: encoded image content.
+  image/filename: image filename.
+  image/format: image file format.
+  image/height: image height.
+  image/width: image width.
+  image/channels: image channels.
+  image/segmentation/class/encoded: encoded panoptic segmentation content.
+  image/segmentation/class/format: segmentation encoding format.
+  image/depth/encoded: encoded depth content.
+  image/depth/format: depth encoding format.
+  video/sequence_id: sequence ID of the frame.
+  video/frame_id: ID of the frame of the video sequence.
+  next_image/encoded: encoded next-frame image content.
+  next_image/segmentation/class/encoded: encoded panoptic segmentation content
+    of the next frame.
+
+The output panoptic segmentation map stored in the Example will be the raw
+bytes of an int32 panoptic map, where each pixel is assigned to a panoptic ID:
+
+  panoptic ID = semantic ID * panoptic divisor (1000) + instance ID
+
+where the semantic ID is the same as `category_id` for each segment, and the
+ignore label for pixels that do not belong to any segment.
+
+The depth map will be the raw bytes of an int32 depth map, where each pixel is:
+
+  depth map = depth ground truth * 256
+
+Example command to run the script:
+
+   python deeplab2/data/build_dvps_data.py \
+     --dvps_root=${DVPS_ROOT} \
+     --output_dir=${OUTPUT_DIR}
+"""
+
+import math
+import os
+
+from typing import Sequence, Tuple, Optional
+
+from absl import app
+from absl import flags
+from absl import logging
+import numpy as np
+
+from PIL import Image
+
+import tensorflow as tf
+
+from deeplab2.data import data_utils
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('dvps_root', None, 'DVPS dataset root folder.')
+
+flags.DEFINE_string('output_dir', None,
+                    'Path to save converted TFRecord of TensorFlow examples.')
+
+_PANOPTIC_DEPTH_FORMAT = 'raw'
+_NUM_SHARDS = 1000
+_TF_RECORD_PATTERN = '%s-%05d-of-%05d.tfrecord'
+_IMAGE_SUFFIX = '_leftImg8bit.png'
+_LABEL_SUFFIX = '_gtFine_instanceTrainIds.png'
+_DEPTH_SUFFIX = '_depth.png'
+
+
+def _get_image_info_from_path(image_path: str) -> Tuple[str, str]:
+  """Gets image info including sequence id and image id.
+
+  Image path is in the format of '{sequence_id}_{image_id}_*.png',
+  where `sequence_id` refers to the id of the video sequence, and `image_id` is
+  the id of the image in the video sequence.
+
+  Args:
+    image_path: Absolute path of the image.
+ + Returns: + sequence_id, and image_id as strings. + """ + image_path = os.path.basename(image_path) + return tuple(image_path.split('_')[:2]) + + +def _get_images(dvps_root: str, dataset_split: str) -> Sequence[str]: + """Gets files for the specified data type and dataset split. + + Args: + dvps_root: String, path to DVPS dataset root folder. + dataset_split: String, dataset split ('train', 'val', 'test'). + + Returns: + A list of sorted file names under dvps_root and dataset_split. + """ + search_files = os.path.join(dvps_root, dataset_split, '*' + _IMAGE_SUFFIX) + filenames = tf.io.gfile.glob(search_files) + return sorted(filenames) + + +def _decode_panoptic_or_depth_map(map_path: str) -> Optional[str]: + """Decodes the panoptic or depth map from encoded image file. + + Args: + map_path: Path to the panoptic or depth map image file. + + Returns: + Panoptic or depth map as an encoded int32 numpy array bytes or None if not + existing. + """ + if not tf.io.gfile.exists(map_path): + return None + with tf.io.gfile.GFile(map_path, 'rb') as f: + decoded_map = np.array(Image.open(f)).astype(np.int32) + return decoded_map.tobytes() + + +def _get_next_frame_path(image_path: str) -> Optional[str]: + """Gets next frame path. + + If not exists, return None. + + The files are named {sequence_id}_{frame_id}*. To get the path of the next + frame, this function keeps sequence_id and increase the frame_id by 1. It + finds all the files matching this pattern, and returns the corresponding + file path matching the input type. + + Args: + image_path: String, path to the image. + + Returns: + A string for the path of the next frame of the given image path or None if + the given image path is the last frame of the sequence. + """ + sequence_id, image_id = _get_image_info_from_path(image_path) + next_image_id = '{:06d}'.format(int(image_id) + 1) + next_image_name = sequence_id + '_' + next_image_id + next_image_path = None + for suffix in (_IMAGE_SUFFIX, _LABEL_SUFFIX): + if image_path.endswith(suffix): + next_image_path = os.path.join( + os.path.dirname(image_path), next_image_name + suffix) + if not tf.io.gfile.exists(next_image_path): + return None + return next_image_path + + +def _create_tfexample(image_path: str, panoptic_map_path: str, + depth_map_path: str) -> Optional[tf.train.Example]: + """Creates a TF example for each image. + + Args: + image_path: Path to the image. + panoptic_map_path: Path to the panoptic map (as an image file). + depth_map_path: Path to the depth map (as an image file). + + Returns: + TF example proto. + """ + with tf.io.gfile.GFile(image_path, 'rb') as f: + image_data = f.read() + label_data = _decode_panoptic_or_depth_map(panoptic_map_path) + depth_data = _decode_panoptic_or_depth_map(depth_map_path) + image_name = os.path.basename(image_path) + image_format = image_name.split('.')[1].lower() + sequence_id, frame_id = _get_image_info_from_path(image_path) + next_image_data = None + next_label_data = None + # Next image. + next_image_path = _get_next_frame_path(image_path) + # If there is no next image, no examples will be created. + if next_image_path is None: + return None + with tf.io.gfile.GFile(next_image_path, 'rb') as f: + next_image_data = f.read() + # Next panoptic map. 
+  next_panoptic_map_path = _get_next_frame_path(panoptic_map_path)
+  next_label_data = _decode_panoptic_or_depth_map(next_panoptic_map_path)
+  return data_utils.create_video_and_depth_tfexample(
+      image_data,
+      image_format,
+      image_name,
+      label_format=_PANOPTIC_DEPTH_FORMAT,
+      sequence_id=sequence_id,
+      image_id=frame_id,
+      label_data=label_data,
+      next_image_data=next_image_data,
+      next_label_data=next_label_data,
+      depth_data=depth_data,
+      depth_format=_PANOPTIC_DEPTH_FORMAT)
+
+
+def _convert_dataset(dvps_root: str, dataset_split: str, output_dir: str):
+  """Converts the specified dataset split to TFRecord format.
+
+  Args:
+    dvps_root: String, path to DVPS dataset root folder.
+    dataset_split: String, the dataset split (e.g., train, val, test).
+    output_dir: String, directory to write output TFRecords to.
+  """
+  image_files = _get_images(dvps_root, dataset_split)
+  num_images = len(image_files)
+
+  num_per_shard = int(math.ceil(len(image_files) / _NUM_SHARDS))
+
+  for shard_id in range(_NUM_SHARDS):
+    shard_filename = _TF_RECORD_PATTERN % (dataset_split, shard_id, _NUM_SHARDS)
+    output_filename = os.path.join(output_dir, shard_filename)
+    with tf.io.TFRecordWriter(output_filename) as tfrecord_writer:
+      start_idx = shard_id * num_per_shard
+      end_idx = min((shard_id + 1) * num_per_shard, num_images)
+      for i in range(start_idx, end_idx):
+        image_path = image_files[i]
+        panoptic_map_path = image_path.replace(_IMAGE_SUFFIX, _LABEL_SUFFIX)
+        depth_map_path = image_path.replace(_IMAGE_SUFFIX, _DEPTH_SUFFIX)
+        example = _create_tfexample(image_path, panoptic_map_path,
+                                    depth_map_path)
+        if example is not None:
+          tfrecord_writer.write(example.SerializeToString())
+
+
+def main(argv: Sequence[str]) -> None:
+  if len(argv) > 1:
+    raise app.UsageError('Too many command-line arguments.')
+  tf.io.gfile.makedirs(FLAGS.output_dir)
+  for dataset_split in ('train', 'val', 'test'):
+    logging.info('Starts processing DVPS dataset split %s.', dataset_split)
+    _convert_dataset(FLAGS.dvps_root, dataset_split, FLAGS.output_dir)
+
+
+if __name__ == '__main__':
+  app.run(main)
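
Given the depth convention in the module docstring above (the depth PNG stores 256x the metric depth as integers), recovering metric depth is a single division. A minimal sketch; treating 0 as missing depth is an assumption for illustration, not something the script enforces:

    import numpy as np

    def decode_depth(encoded_depth):
      # Inverse of: encoded depth = ground-truth depth * 256.
      depth = encoded_depth.astype(np.float32) / 256.0
      # Assumed convention: 0 marks pixels without a depth measurement.
      return np.where(encoded_depth > 0, depth, np.nan)

    encoded = np.array([[0, 256, 4864]], dtype=np.int32)
    print(decode_depth(encoded))  # [[nan  1. 19.]]
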
diff --git a/data/build_step_data.py b/data/build_step_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..d08653cbb28661f93763f2af54525c541381879f
--- /dev/null
+++ b/data/build_step_data.py
@@ -0,0 +1,298 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""Converts STEP (KITTI-STEP or MOTChallenge-STEP) data to sharded TFRecord file format with tf.train.Example protos.
+
+The expected directory structure of the STEP dataset should be as follows:
+
+  + {KITTI | MOTChallenge}-STEP
+    + images
+      + train
+        + sequence_id
+          - *.{png|jpg}
+          ...
+      + val
+      + test
+    + panoptic_maps
+      + train
+        + sequence_id
+          - *.png
+          ...
+      + val
+
+The ground-truth panoptic map is encoded as the following in PNG format:
+
+  R: semantic_id
+  G: instance_id // 256
+  B: instance_id % 256
+
+See ./utils/create_step_panoptic_maps.py for more details of how we create the
+panoptic map by merging semantic and instance maps.
+
+The output Example proto contains the following fields:
+
+  image/encoded: encoded image content.
+  image/filename: image filename.
+  image/format: image file format.
+  image/height: image height.
+  image/width: image width.
+  image/channels: image channels.
+  image/segmentation/class/encoded: encoded panoptic segmentation content.
+  image/segmentation/class/format: segmentation encoding format.
+  video/sequence_id: sequence ID of the frame.
+  video/frame_id: ID of the frame of the video sequence.
+
+The output panoptic segmentation map stored in the Example will be the raw
+bytes of an int32 panoptic map, where each pixel is assigned to a panoptic ID:
+
+  panoptic ID = semantic ID * label divisor (1000) + instance ID
+
+where the semantic ID is the same as `category_id` (using TrainId) for each
+segment, and the ignore label for pixels that do not belong to any segment.
+
+The instance ID will be 0 for pixels belonging to
+  1) `stuff` class
+  2) `thing` class with `iscrowd` label
+  3) pixels with ignore label
+and [1, label divisor) otherwise.
+
+Example command to run the script:
+
+   python deeplab2/data/build_step_data.py \
+     --step_root=${STEP_ROOT} \
+     --output_dir=${OUTPUT_DIR}
+"""
+
+import math
+import os
+
+from typing import Iterator, Sequence, Tuple, Optional
+
+from absl import app
+from absl import flags
+from absl import logging
+import numpy as np
+
+from PIL import Image
+
+import tensorflow as tf
+
+from deeplab2.data import data_utils
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string('step_root', None, 'STEP dataset root folder.')
+
+flags.DEFINE_string('output_dir', None,
+                    'Path to save converted TFRecord of TensorFlow examples.')
+flags.DEFINE_bool(
+    'use_two_frames', False, 'Whether to encode 1 frame or 2 consecutive '
+    'frames in each TFExample.')
+
+_PANOPTIC_LABEL_FORMAT = 'raw'
+_NUM_SHARDS = 10
+_IMAGE_FOLDER_NAME = 'images'
+_PANOPTIC_MAP_FOLDER_NAME = 'panoptic_maps'
+_LABEL_MAP_FORMAT = 'png'
+_INSTANCE_LABEL_DIVISOR = 1000
+_ENCODED_INSTANCE_LABEL_DIVISOR = 256
+_TF_RECORD_PATTERN = '%s-%05d-of-%05d.tfrecord'
+_FRAME_ID_PATTERN = '%06d'
+
+
+def _get_image_info_from_path(image_path: str) -> Tuple[str, str]:
+  """Gets image info including sequence id and image id.
+
+  Image path is in the format of '.../split/sequence_id/image_id.png',
+  where `sequence_id` refers to the id of the video sequence, and `image_id` is
+  the id of the image in the video sequence.
+
+  Args:
+    image_path: Absolute path of the image.
+
+  Returns:
+    sequence_id, and image_id as strings.
+  """
+  sequence_id = image_path.split('/')[-2]
+  image_id = os.path.splitext(os.path.basename(image_path))[0]
+  return sequence_id, image_id
+
+
+def _get_images_per_shard(step_root: str, dataset_split: str,
+                          sharded_by_sequence: bool) -> Iterator[Sequence[str]]:
+  """Gets files for the specified data type and dataset split.
+
+  Args:
+    step_root: String, path to STEP dataset root folder.
+    dataset_split: String, dataset split ('train', 'val', 'test').
+    sharded_by_sequence: Whether the images should be sharded by sequence or
+      split evenly.
+
+  Yields:
+    Sorted lists of file names, one list per shard; each list contains the
+    files for that shard.
+ """ + search_files = os.path.join(step_root, _IMAGE_FOLDER_NAME, dataset_split, '*', + '*') + filenames = sorted(tf.io.gfile.glob(search_files)) + num_per_even_shard = int(math.ceil(len(filenames) / _NUM_SHARDS)) + + sequence_ids = [os.path.basename(os.path.dirname(name)) for name in filenames] + images_per_shard = [] + for i, name in enumerate(filenames): + images_per_shard.append(name) + shard_data = (i == len(filenames) - 1) + # Sharded by sequence id. + shard_data = shard_data or (sharded_by_sequence and + sequence_ids[i + 1] != sequence_ids[i]) + # Sharded evenly. + shard_data = shard_data or (not sharded_by_sequence and + len(images_per_shard) == num_per_even_shard) + if shard_data: + yield images_per_shard + images_per_shard = [] + + +def _decode_panoptic_map(panoptic_map_path: str) -> Optional[str]: + """Decodes the panoptic map from encoded image file. + + Args: + panoptic_map_path: Path to the panoptic map image file. + + Returns: + Panoptic map as an encoded int32 numpy array bytes or None if not existing. + """ + if not tf.io.gfile.exists(panoptic_map_path): + return None + with tf.io.gfile.GFile(panoptic_map_path, 'rb') as f: + panoptic_map = np.array(Image.open(f)).astype(np.int32) + semantic_map = panoptic_map[:, :, 0] + instance_map = ( + panoptic_map[:, :, 1] * _ENCODED_INSTANCE_LABEL_DIVISOR + + panoptic_map[:, :, 2]) + panoptic_map = semantic_map * _INSTANCE_LABEL_DIVISOR + instance_map + return panoptic_map.tobytes() + + +def _get_previous_frame_path(image_path: str) -> str: + """Gets previous frame path. If not exists, duplicate it with image_path.""" + frame_id, frame_ext = os.path.splitext(os.path.basename(image_path)) + folder_dir = os.path.dirname(image_path) + prev_frame_id = _FRAME_ID_PATTERN % (int(frame_id) - 1) + prev_image_path = os.path.join(folder_dir, prev_frame_id + frame_ext) + # If first frame, duplicates it. + if not tf.io.gfile.exists(prev_image_path): + tf.compat.v1.logging.warn( + 'Could not find previous frame %s of frame %d, duplicate the previous ' + 'frame with the current frame.', prev_image_path, int(frame_id)) + prev_image_path = image_path + return prev_image_path + + +def _create_panoptic_tfexample(image_path: str, + panoptic_map_path: str, + use_two_frames: bool, + is_testing: bool = False) -> tf.train.Example: + """Creates a TF example for each image. + + Args: + image_path: Path to the image. + panoptic_map_path: Path to the panoptic map (as an image file). + use_two_frames: Whether to encode consecutive two frames in the Example. + is_testing: Whether it is testing data. If so, skip adding label data. + + Returns: + TF example proto. + """ + with tf.io.gfile.GFile(image_path, 'rb') as f: + image_data = f.read() + label_data = None + if not is_testing: + label_data = _decode_panoptic_map(panoptic_map_path) + image_name = os.path.basename(image_path) + image_format = image_name.split('.')[1].lower() + sequence_id, frame_id = _get_image_info_from_path(image_path) + prev_image_data = None + prev_label_data = None + if use_two_frames: + # Previous image. + prev_image_path = _get_previous_frame_path(image_path) + with tf.io.gfile.GFile(prev_image_path, 'rb') as f: + prev_image_data = f.read() + # Previous panoptic map. 
+    if not is_testing:
+      prev_panoptic_map_path = _get_previous_frame_path(panoptic_map_path)
+      prev_label_data = _decode_panoptic_map(prev_panoptic_map_path)
+  return data_utils.create_video_tfexample(
+      image_data,
+      image_format,
+      image_name,
+      label_format=_PANOPTIC_LABEL_FORMAT,
+      sequence_id=sequence_id,
+      image_id=frame_id,
+      label_data=label_data,
+      prev_image_data=prev_image_data,
+      prev_label_data=prev_label_data)
+
+
+def _convert_dataset(step_root: str,
+                     dataset_split: str,
+                     output_dir: str,
+                     use_two_frames: bool = False):
+  """Converts the specified dataset split to TFRecord format.
+
+  Args:
+    step_root: String, path to STEP dataset root folder.
+    dataset_split: String, the dataset split (e.g., train, val).
+    output_dir: String, directory to write output TFRecords to.
+    use_two_frames: Whether to encode two consecutive frames in the Example.
+  """
+  # For the val and test sets, if we run with use_two_frames, we should create
+  # one sorted tfrecord per sequence.
+  create_tfrecord_per_sequence = ('train'
+                                  not in dataset_split) and use_two_frames
+  is_testing = 'test' in dataset_split
+
+  image_files_per_shard = list(
+      _get_images_per_shard(step_root, dataset_split,
+                            sharded_by_sequence=create_tfrecord_per_sequence))
+  num_shards = len(image_files_per_shard)
+
+  for shard_id, image_list in enumerate(image_files_per_shard):
+    shard_filename = _TF_RECORD_PATTERN % (dataset_split, shard_id, num_shards)
+    output_filename = os.path.join(output_dir, shard_filename)
+    with tf.io.TFRecordWriter(output_filename) as tfrecord_writer:
+      for image_path in image_list:
+        sequence_id, image_id = _get_image_info_from_path(image_path)
+        panoptic_map_path = os.path.join(
+            step_root, _PANOPTIC_MAP_FOLDER_NAME, dataset_split, sequence_id,
+            '%s.%s' % (image_id, _LABEL_MAP_FORMAT))
+        example = _create_panoptic_tfexample(image_path, panoptic_map_path,
+                                             use_two_frames, is_testing)
+        tfrecord_writer.write(example.SerializeToString())
+
+
+def main(argv: Sequence[str]) -> None:
+  if len(argv) > 1:
+    raise app.UsageError('Too many command-line arguments.')
+  tf.io.gfile.makedirs(FLAGS.output_dir)
+  for dataset_split in ('train', 'val', 'test'):
+    logging.info('Starts processing STEP dataset split %s.', dataset_split)
+    _convert_dataset(FLAGS.step_root, dataset_split, FLAGS.output_dir,
+                     FLAGS.use_two_frames)
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/data/build_step_data_test.py b/data/build_step_data_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b430b928829f997dc0d093fd9507b8c89550f6bc
--- /dev/null
+++ b/data/build_step_data_test.py
@@ -0,0 +1,164 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
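
The sequence-based sharding that _get_images_per_shard implements amounts to run-grouping sorted paths by their parent directory; a standalone sketch with hypothetical file names:

    import itertools
    import os

    filenames = sorted(
        ['0001/000000.png', '0001/000001.png', '0002/000000.png'])
    shards = [
        list(files) for _, files in itertools.groupby(
            filenames, key=os.path.dirname)
    ]
    print([len(shard) for shard in shards])  # [2, 1]
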
+ +"""Tests for build_step_data.""" + +import os + +from absl import flags +import numpy as np +from PIL import Image +import tensorflow as tf + +from deeplab2.data import build_step_data + +FLAGS = flags.FLAGS + + +class BuildStepDataTest(tf.test.TestCase): + + def setUp(self): + super().setUp() + self.data_dir = FLAGS.test_tmpdir + self.height = 100 + self.width = 100 + self.sequence_id = '010' + + def _create_images(self, split): + image_path = os.path.join(self.data_dir, build_step_data._IMAGE_FOLDER_NAME, + split, self.sequence_id) + panoptic_map_path = os.path.join(self.data_dir, + build_step_data._PANOPTIC_MAP_FOLDER_NAME, + split, self.sequence_id) + + tf.io.gfile.makedirs(image_path) + tf.io.gfile.makedirs(panoptic_map_path) + self.panoptic_maps = {} + for image_id in [101, 100]: + self.panoptic_maps[image_id] = self._create_image_and_panoptic_map( + image_path, panoptic_map_path, image_id) + + def _create_image_and_panoptic_map(self, image_path, panoptic_path, image_id): + """Creates dummy images and panoptic maps.""" + # Dummy image. + image = np.random.randint( + 0, 255, (self.height, self.width, 3), dtype=np.uint8) + with tf.io.gfile.GFile( + os.path.join(image_path, '%06d.png' % image_id), 'wb') as f: + Image.fromarray(image).save(f, format='PNG') + + # Dummy panoptic map. + semantic = np.random.randint( + 0, 20, (self.height, self.width), dtype=np.int32) + instance = np.random.randint( + 0, 1000, (self.height, self.width), dtype=np.int32) + encoded_panoptic_map = np.dstack( + (semantic, instance // 256, instance % 256)).astype(np.uint8) + with tf.io.gfile.GFile( + os.path.join(panoptic_path, '%06d.png' % image_id), 'wb') as f: + Image.fromarray(encoded_panoptic_map).save(f, format='PNG') + decoded_panoptic_map = semantic * 1000 + instance + return decoded_panoptic_map + + def test_build_step_dataset_correct(self): + split = 'train' + self._create_images(split) + build_step_data._convert_dataset( + step_root=self.data_dir, + dataset_split=split, + output_dir=FLAGS.test_tmpdir) + # We will have 2 shards with each shard containing 1 image. + num_shards = 2 + output_record = os.path.join( + FLAGS.test_tmpdir, build_step_data._TF_RECORD_PATTERN % + (split, 0, num_shards)) + self.assertTrue(tf.io.gfile.exists(output_record)) + + # Parses tf record. + image_ids = sorted(self.panoptic_maps) + for i, raw_record in enumerate( + tf.data.TFRecordDataset([output_record]).take(5)): + image_id = image_ids[i] + example = tf.train.Example.FromString(raw_record.numpy()) + panoptic_map = np.fromstring( + example.features.feature['image/segmentation/class/encoded'] + .bytes_list.value[0], + dtype=np.int32).reshape((self.height, self.width)) + np.testing.assert_array_equal(panoptic_map, self.panoptic_maps[image_id]) + self.assertEqual( + example.features.feature['video/sequence_id'].bytes_list.value[0], + b'010') + self.assertEqual( + example.features.feature['video/frame_id'].bytes_list.value[0], + b'%06d' % image_id) + + def test_build_step_dataset_correct_with_two_frames(self): + split = 'train' + self._create_images(split) + build_step_data._convert_dataset( + step_root=self.data_dir, + dataset_split=split, + output_dir=FLAGS.test_tmpdir, use_two_frames=True) + num_shards = 2 + output_record = os.path.join( + FLAGS.test_tmpdir, build_step_data._TF_RECORD_PATTERN % + (split, 0, num_shards)) + self.assertTrue(tf.io.gfile.exists(output_record)) + + # Parses tf record. 
+ image_ids = sorted(self.panoptic_maps) + for i, raw_record in enumerate( + tf.data.TFRecordDataset([output_record]).take(5)): + image_id = image_ids[i] + example = tf.train.Example.FromString(raw_record.numpy()) + panoptic_map = np.fromstring( + example.features.feature['image/segmentation/class/encoded'] + .bytes_list.value[0], + dtype=np.int32).reshape((self.height, self.width)) + np.testing.assert_array_equal(panoptic_map, self.panoptic_maps[image_id]) + prev_panoptic_map = np.fromstring( + example.features.feature['prev_image/segmentation/class/encoded'] + .bytes_list.value[0], + dtype=np.int32).reshape((self.height, self.width)) + if i == 0: + # First frame. + np.testing.assert_array_equal(panoptic_map, prev_panoptic_map) + else: + # Not a first frame. + np.testing.assert_array_equal(prev_panoptic_map, self.panoptic_maps[0]) + self.assertEqual( + example.features.feature['video/sequence_id'].bytes_list.value[0], + b'010') + self.assertEqual( + example.features.feature['video/frame_id'].bytes_list.value[0], + b'%06d' % image_id) + + def test_build_step_dataset_with_two_frames_shared_by_sequence(self): + split = 'val' + self._create_images(split) + build_step_data._convert_dataset( + step_root=self.data_dir, + dataset_split=split, + output_dir=FLAGS.test_tmpdir, use_two_frames=True) + # Only one shard since there is only one sequence for the val set. + num_shards = 1 + output_record = os.path.join( + FLAGS.test_tmpdir, build_step_data._TF_RECORD_PATTERN % + (split, 0, num_shards)) + self.assertTrue(tf.io.gfile.exists(output_record)) + + +if __name__ == '__main__': + tf.test.main() diff --git a/data/coco_constants.py b/data/coco_constants.py new file mode 100644 index 0000000000000000000000000000000000000000..ac0a5ef03db71fa91e93bc9103319c71c2001941 --- /dev/null +++ b/data/coco_constants.py @@ -0,0 +1,865 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""File containing the meta info of COCO dataset. 
+""" + +import copy +from typing import Sequence, Mapping, Any + +_COCO_META = [ + { + 'color': [220, 20, 60], + 'isthing': 1, + 'id': 1, + 'name': 'person' + }, + { + 'color': [119, 11, 32], + 'isthing': 1, + 'id': 2, + 'name': 'bicycle' + }, + { + 'color': [0, 0, 142], + 'isthing': 1, + 'id': 3, + 'name': 'car' + }, + { + 'color': [0, 0, 230], + 'isthing': 1, + 'id': 4, + 'name': 'motorcycle' + }, + { + 'color': [106, 0, 228], + 'isthing': 1, + 'id': 5, + 'name': 'airplane' + }, + { + 'color': [0, 60, 100], + 'isthing': 1, + 'id': 6, + 'name': 'bus' + }, + { + 'color': [0, 80, 100], + 'isthing': 1, + 'id': 7, + 'name': 'train' + }, + { + 'color': [0, 0, 70], + 'isthing': 1, + 'id': 8, + 'name': 'truck' + }, + { + 'color': [0, 0, 192], + 'isthing': 1, + 'id': 9, + 'name': 'boat' + }, + { + 'color': [250, 170, 30], + 'isthing': 1, + 'id': 10, + 'name': 'traffic light' + }, + { + 'color': [100, 170, 30], + 'isthing': 1, + 'id': 11, + 'name': 'fire hydrant' + }, + { + 'color': [220, 220, 0], + 'isthing': 1, + 'id': 13, + 'name': 'stop sign' + }, + { + 'color': [175, 116, 175], + 'isthing': 1, + 'id': 14, + 'name': 'parking meter' + }, + { + 'color': [250, 0, 30], + 'isthing': 1, + 'id': 15, + 'name': 'bench' + }, + { + 'color': [165, 42, 42], + 'isthing': 1, + 'id': 16, + 'name': 'bird' + }, + { + 'color': [255, 77, 255], + 'isthing': 1, + 'id': 17, + 'name': 'cat' + }, + { + 'color': [0, 226, 252], + 'isthing': 1, + 'id': 18, + 'name': 'dog' + }, + { + 'color': [182, 182, 255], + 'isthing': 1, + 'id': 19, + 'name': 'horse' + }, + { + 'color': [0, 82, 0], + 'isthing': 1, + 'id': 20, + 'name': 'sheep' + }, + { + 'color': [120, 166, 157], + 'isthing': 1, + 'id': 21, + 'name': 'cow' + }, + { + 'color': [110, 76, 0], + 'isthing': 1, + 'id': 22, + 'name': 'elephant' + }, + { + 'color': [174, 57, 255], + 'isthing': 1, + 'id': 23, + 'name': 'bear' + }, + { + 'color': [199, 100, 0], + 'isthing': 1, + 'id': 24, + 'name': 'zebra' + }, + { + 'color': [72, 0, 118], + 'isthing': 1, + 'id': 25, + 'name': 'giraffe' + }, + { + 'color': [255, 179, 240], + 'isthing': 1, + 'id': 27, + 'name': 'backpack' + }, + { + 'color': [0, 125, 92], + 'isthing': 1, + 'id': 28, + 'name': 'umbrella' + }, + { + 'color': [209, 0, 151], + 'isthing': 1, + 'id': 31, + 'name': 'handbag' + }, + { + 'color': [188, 208, 182], + 'isthing': 1, + 'id': 32, + 'name': 'tie' + }, + { + 'color': [0, 220, 176], + 'isthing': 1, + 'id': 33, + 'name': 'suitcase' + }, + { + 'color': [255, 99, 164], + 'isthing': 1, + 'id': 34, + 'name': 'frisbee' + }, + { + 'color': [92, 0, 73], + 'isthing': 1, + 'id': 35, + 'name': 'skis' + }, + { + 'color': [133, 129, 255], + 'isthing': 1, + 'id': 36, + 'name': 'snowboard' + }, + { + 'color': [78, 180, 255], + 'isthing': 1, + 'id': 37, + 'name': 'sports ball' + }, + { + 'color': [0, 228, 0], + 'isthing': 1, + 'id': 38, + 'name': 'kite' + }, + { + 'color': [174, 255, 243], + 'isthing': 1, + 'id': 39, + 'name': 'baseball bat' + }, + { + 'color': [45, 89, 255], + 'isthing': 1, + 'id': 40, + 'name': 'baseball glove' + }, + { + 'color': [134, 134, 103], + 'isthing': 1, + 'id': 41, + 'name': 'skateboard' + }, + { + 'color': [145, 148, 174], + 'isthing': 1, + 'id': 42, + 'name': 'surfboard' + }, + { + 'color': [255, 208, 186], + 'isthing': 1, + 'id': 43, + 'name': 'tennis racket' + }, + { + 'color': [197, 226, 255], + 'isthing': 1, + 'id': 44, + 'name': 'bottle' + }, + { + 'color': [171, 134, 1], + 'isthing': 1, + 'id': 46, + 'name': 'wine glass' + }, + { + 'color': [109, 63, 54], + 'isthing': 1, + 'id': 47, + 'name': 
'cup' + }, + { + 'color': [207, 138, 255], + 'isthing': 1, + 'id': 48, + 'name': 'fork' + }, + { + 'color': [151, 0, 95], + 'isthing': 1, + 'id': 49, + 'name': 'knife' + }, + { + 'color': [9, 80, 61], + 'isthing': 1, + 'id': 50, + 'name': 'spoon' + }, + { + 'color': [84, 105, 51], + 'isthing': 1, + 'id': 51, + 'name': 'bowl' + }, + { + 'color': [74, 65, 105], + 'isthing': 1, + 'id': 52, + 'name': 'banana' + }, + { + 'color': [166, 196, 102], + 'isthing': 1, + 'id': 53, + 'name': 'apple' + }, + { + 'color': [208, 195, 210], + 'isthing': 1, + 'id': 54, + 'name': 'sandwich' + }, + { + 'color': [255, 109, 65], + 'isthing': 1, + 'id': 55, + 'name': 'orange' + }, + { + 'color': [0, 143, 149], + 'isthing': 1, + 'id': 56, + 'name': 'broccoli' + }, + { + 'color': [179, 0, 194], + 'isthing': 1, + 'id': 57, + 'name': 'carrot' + }, + { + 'color': [209, 99, 106], + 'isthing': 1, + 'id': 58, + 'name': 'hot dog' + }, + { + 'color': [5, 121, 0], + 'isthing': 1, + 'id': 59, + 'name': 'pizza' + }, + { + 'color': [227, 255, 205], + 'isthing': 1, + 'id': 60, + 'name': 'donut' + }, + { + 'color': [147, 186, 208], + 'isthing': 1, + 'id': 61, + 'name': 'cake' + }, + { + 'color': [153, 69, 1], + 'isthing': 1, + 'id': 62, + 'name': 'chair' + }, + { + 'color': [3, 95, 161], + 'isthing': 1, + 'id': 63, + 'name': 'couch' + }, + { + 'color': [163, 255, 0], + 'isthing': 1, + 'id': 64, + 'name': 'potted plant' + }, + { + 'color': [119, 0, 170], + 'isthing': 1, + 'id': 65, + 'name': 'bed' + }, + { + 'color': [0, 182, 199], + 'isthing': 1, + 'id': 67, + 'name': 'dining table' + }, + { + 'color': [0, 165, 120], + 'isthing': 1, + 'id': 70, + 'name': 'toilet' + }, + { + 'color': [183, 130, 88], + 'isthing': 1, + 'id': 72, + 'name': 'tv' + }, + { + 'color': [95, 32, 0], + 'isthing': 1, + 'id': 73, + 'name': 'laptop' + }, + { + 'color': [130, 114, 135], + 'isthing': 1, + 'id': 74, + 'name': 'mouse' + }, + { + 'color': [110, 129, 133], + 'isthing': 1, + 'id': 75, + 'name': 'remote' + }, + { + 'color': [166, 74, 118], + 'isthing': 1, + 'id': 76, + 'name': 'keyboard' + }, + { + 'color': [219, 142, 185], + 'isthing': 1, + 'id': 77, + 'name': 'cell phone' + }, + { + 'color': [79, 210, 114], + 'isthing': 1, + 'id': 78, + 'name': 'microwave' + }, + { + 'color': [178, 90, 62], + 'isthing': 1, + 'id': 79, + 'name': 'oven' + }, + { + 'color': [65, 70, 15], + 'isthing': 1, + 'id': 80, + 'name': 'toaster' + }, + { + 'color': [127, 167, 115], + 'isthing': 1, + 'id': 81, + 'name': 'sink' + }, + { + 'color': [59, 105, 106], + 'isthing': 1, + 'id': 82, + 'name': 'refrigerator' + }, + { + 'color': [142, 108, 45], + 'isthing': 1, + 'id': 84, + 'name': 'book' + }, + { + 'color': [196, 172, 0], + 'isthing': 1, + 'id': 85, + 'name': 'clock' + }, + { + 'color': [95, 54, 80], + 'isthing': 1, + 'id': 86, + 'name': 'vase' + }, + { + 'color': [128, 76, 255], + 'isthing': 1, + 'id': 87, + 'name': 'scissors' + }, + { + 'color': [201, 57, 1], + 'isthing': 1, + 'id': 88, + 'name': 'teddy bear' + }, + { + 'color': [246, 0, 122], + 'isthing': 1, + 'id': 89, + 'name': 'hair drier' + }, + { + 'color': [191, 162, 208], + 'isthing': 1, + 'id': 90, + 'name': 'toothbrush' + }, + { + 'color': [255, 255, 128], + 'isthing': 0, + 'id': 92, + 'name': 'banner' + }, + { + 'color': [147, 211, 203], + 'isthing': 0, + 'id': 93, + 'name': 'blanket' + }, + { + 'color': [150, 100, 100], + 'isthing': 0, + 'id': 95, + 'name': 'bridge' + }, + { + 'color': [168, 171, 172], + 'isthing': 0, + 'id': 100, + 'name': 'cardboard' + }, + { + 'color': [146, 112, 198], + 'isthing': 0, + 
'id': 107, + 'name': 'counter' + }, + { + 'color': [210, 170, 100], + 'isthing': 0, + 'id': 109, + 'name': 'curtain' + }, + { + 'color': [92, 136, 89], + 'isthing': 0, + 'id': 112, + 'name': 'door-stuff' + }, + { + 'color': [218, 88, 184], + 'isthing': 0, + 'id': 118, + 'name': 'floor-wood' + }, + { + 'color': [241, 129, 0], + 'isthing': 0, + 'id': 119, + 'name': 'flower' + }, + { + 'color': [217, 17, 255], + 'isthing': 0, + 'id': 122, + 'name': 'fruit' + }, + { + 'color': [124, 74, 181], + 'isthing': 0, + 'id': 125, + 'name': 'gravel' + }, + { + 'color': [70, 70, 70], + 'isthing': 0, + 'id': 128, + 'name': 'house' + }, + { + 'color': [255, 228, 255], + 'isthing': 0, + 'id': 130, + 'name': 'light' + }, + { + 'color': [154, 208, 0], + 'isthing': 0, + 'id': 133, + 'name': 'mirror-stuff' + }, + { + 'color': [193, 0, 92], + 'isthing': 0, + 'id': 138, + 'name': 'net' + }, + { + 'color': [76, 91, 113], + 'isthing': 0, + 'id': 141, + 'name': 'pillow' + }, + { + 'color': [255, 180, 195], + 'isthing': 0, + 'id': 144, + 'name': 'platform' + }, + { + 'color': [106, 154, 176], + 'isthing': 0, + 'id': 145, + 'name': 'playingfield' + }, + { + 'color': [230, 150, 140], + 'isthing': 0, + 'id': 147, + 'name': 'railroad' + }, + { + 'color': [60, 143, 255], + 'isthing': 0, + 'id': 148, + 'name': 'river' + }, + { + 'color': [128, 64, 128], + 'isthing': 0, + 'id': 149, + 'name': 'road' + }, + { + 'color': [92, 82, 55], + 'isthing': 0, + 'id': 151, + 'name': 'roof' + }, + { + 'color': [254, 212, 124], + 'isthing': 0, + 'id': 154, + 'name': 'sand' + }, + { + 'color': [73, 77, 174], + 'isthing': 0, + 'id': 155, + 'name': 'sea' + }, + { + 'color': [255, 160, 98], + 'isthing': 0, + 'id': 156, + 'name': 'shelf' + }, + { + 'color': [255, 255, 255], + 'isthing': 0, + 'id': 159, + 'name': 'snow' + }, + { + 'color': [104, 84, 109], + 'isthing': 0, + 'id': 161, + 'name': 'stairs' + }, + { + 'color': [169, 164, 131], + 'isthing': 0, + 'id': 166, + 'name': 'tent' + }, + { + 'color': [225, 199, 255], + 'isthing': 0, + 'id': 168, + 'name': 'towel' + }, + { + 'color': [137, 54, 74], + 'isthing': 0, + 'id': 171, + 'name': 'wall-brick' + }, + { + 'color': [135, 158, 223], + 'isthing': 0, + 'id': 175, + 'name': 'wall-stone' + }, + { + 'color': [7, 246, 231], + 'isthing': 0, + 'id': 176, + 'name': 'wall-tile' + }, + { + 'color': [107, 255, 200], + 'isthing': 0, + 'id': 177, + 'name': 'wall-wood' + }, + { + 'color': [58, 41, 149], + 'isthing': 0, + 'id': 178, + 'name': 'water-other' + }, + { + 'color': [183, 121, 142], + 'isthing': 0, + 'id': 180, + 'name': 'window-blind' + }, + { + 'color': [255, 73, 97], + 'isthing': 0, + 'id': 181, + 'name': 'window-other' + }, + { + 'color': [107, 142, 35], + 'isthing': 0, + 'id': 184, + 'name': 'tree-merged' + }, + { + 'color': [190, 153, 153], + 'isthing': 0, + 'id': 185, + 'name': 'fence-merged' + }, + { + 'color': [146, 139, 141], + 'isthing': 0, + 'id': 186, + 'name': 'ceiling-merged' + }, + { + 'color': [70, 130, 180], + 'isthing': 0, + 'id': 187, + 'name': 'sky-other-merged' + }, + { + 'color': [134, 199, 156], + 'isthing': 0, + 'id': 188, + 'name': 'cabinet-merged' + }, + { + 'color': [209, 226, 140], + 'isthing': 0, + 'id': 189, + 'name': 'table-merged' + }, + { + 'color': [96, 36, 108], + 'isthing': 0, + 'id': 190, + 'name': 'floor-other-merged' + }, + { + 'color': [96, 96, 96], + 'isthing': 0, + 'id': 191, + 'name': 'pavement-merged' + }, + { + 'color': [64, 170, 64], + 'isthing': 0, + 'id': 192, + 'name': 'mountain-merged' + }, + { + 'color': [152, 251, 152], + 'isthing': 0, + 
+        'id': 193,
+        'name': 'grass-merged'
+    },
+    {
+        'color': [208, 229, 228],
+        'isthing': 0,
+        'id': 194,
+        'name': 'dirt-merged'
+    },
+    {
+        'color': [206, 186, 171],
+        'isthing': 0,
+        'id': 195,
+        'name': 'paper-merged'
+    },
+    {
+        'color': [152, 161, 64],
+        'isthing': 0,
+        'id': 196,
+        'name': 'food-other-merged'
+    },
+    {
+        'color': [116, 112, 0],
+        'isthing': 0,
+        'id': 197,
+        'name': 'building-other-merged'
+    },
+    {
+        'color': [0, 114, 143],
+        'isthing': 0,
+        'id': 198,
+        'name': 'rock-merged'
+    },
+    {
+        'color': [102, 102, 156],
+        'isthing': 0,
+        'id': 199,
+        'name': 'wall-other-merged'
+    },
+    {
+        'color': [250, 141, 255],
+        'isthing': 0,
+        'id': 200,
+        'name': 'rug-merged'
+    },
+]
+
+
+def get_coco_meta() -> Sequence[Any]:
+  return copy.deepcopy(_COCO_META)
+
+
+def get_id_mapping() -> Mapping[int, int]:
+  """Creates a dictionary mapping the original category_id into continuous ones.
+
+  Specifically, in coco annotations, category_id ranges from 1 to 200. Since
+  not every id between 1 and 200 is used, we map them to contiguous ids
+  (1 to 133), which saves memory and computation to some degree.
+
+  Returns:
+    A dictionary mapping original category ids to contiguous category ids.
+  """
+  id_mapping = {}
+  for i in range(len(_COCO_META)):
+    id_mapping[_COCO_META[i]['id']] = i + 1
+  return id_mapping
+
+
+def get_id_mapping_inverse() -> Sequence[int]:
+  """Creates a tuple mapping the continuous ids back to original ones.
+
+  Specifically, in coco annotations, category_id ranges from 1 to 200. Since
+  not every id between 1 and 200 is used, we map them to contiguous ids
+  (1 to 133) via the function get_id_mapping, which saves memory and
+  computation to some degree. This function supports remapping back from the
+  contiguous ids to the original ones, which is required for COCO official
+  evaluation.
+
+  Returns:
+    A tuple mapping contiguous category ids to the original COCO category ids.
+  """
+  id_mapping_inverse = (0,) + tuple([ori_cat['id'] for ori_cat in _COCO_META])
+  return id_mapping_inverse
+
+
+def get_coco_reduced_meta() -> Sequence[Any]:
+  coco_reduced_meta = get_coco_meta()
+  id_mapping = get_id_mapping()
+  for i in range(len(coco_reduced_meta)):
+    coco_reduced_meta[i].update({'id': id_mapping[coco_reduced_meta[i]['id']]})
+  return coco_reduced_meta
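
A short usage sketch of the two mappings above; the leading 0 in the inverse tuple exists so that indexing it with a contiguous id recovers the original COCO id directly:

    from deeplab2.data import coco_constants

    id_mapping = coco_constants.get_id_mapping()
    id_mapping_inverse = coco_constants.get_id_mapping_inverse()

    # Round trip: original COCO id -> contiguous id -> original COCO id.
    for coco_id, contiguous_id in id_mapping.items():
      assert id_mapping_inverse[contiguous_id] == coco_id
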
diff --git a/data/data_utils.py b/data/data_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..74260d974565402ca50a7726c280a721358e6502
--- /dev/null
+++ b/data/data_utils.py
@@ -0,0 +1,391 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Contains common utility functions and classes for building datasets."""
+
+import collections.abc
+import io
+
+import numpy as np
+from PIL import Image
+from PIL import ImageOps
+import tensorflow as tf
+
+from deeplab2 import common
+
+_PANOPTIC_LABEL_FORMAT = 'raw'
+
+
+def read_image(image_data):
+  """Decodes image from in-memory data.
+
+  Args:
+    image_data: Bytes data representing encoded image.
+
+  Returns:
+    Decoded PIL.Image object.
+  """
+  image = Image.open(io.BytesIO(image_data))
+
+  try:
+    image = ImageOps.exif_transpose(image)
+  except TypeError:
+    # capture and ignore this bug:
+    # https://github.com/python-pillow/Pillow/issues/3973
+    pass
+
+  return image
+
+
+def get_image_dims(image_data, check_is_rgb=False):
+  """Decodes image and return its height and width.
+
+  Args:
+    image_data: Bytes data representing encoded image.
+    check_is_rgb: Whether to check encoded image is RGB.
+
+  Returns:
+    Decoded image size as a tuple of (height, width).
+
+  Raises:
+    ValueError: If check_is_rgb is set and input image has other format.
+  """
+  image = read_image(image_data)
+
+  if check_is_rgb and image.mode != 'RGB':
+    raise ValueError('Expects RGB image data, gets mode: %s' % image.mode)
+
+  width, height = image.size
+  return height, width
+
+
+def _int64_list_feature(values):
+  """Returns a TF-Feature of int64_list.
+
+  Args:
+    values: A scalar or an iterable of integer values.
+
+  Returns:
+    A TF-Feature.
+  """
+  # collections.abc.Iterable is used here because the unqualified
+  # collections.Iterable alias was removed in Python 3.10.
+  if not isinstance(values, collections.abc.Iterable):
+    values = [values]
+
+  return tf.train.Feature(int64_list=tf.train.Int64List(value=values))
+
+
+def _bytes_list_feature(values):
+  """Returns a TF-Feature of bytes.
+
+  Args:
+    values: A string.
+
+  Returns:
+    A TF-Feature.
+  """
+  if isinstance(values, str):
+    values = values.encode()
+
+  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[values]))
+
+
+def create_features(image_data,
+                    image_format,
+                    filename,
+                    label_data=None,
+                    label_format=None):
+  """Creates image/segmentation features.
+
+  Args:
+    image_data: String or byte stream of encoded image data.
+    image_format: String, image data format, should be either 'jpeg' or 'png'.
+    filename: String, image filename.
+    label_data: String or byte stream of (potentially) encoded label data. If
+      None, we skip to write it to tf.train.Example.
+    label_format: String, label data format, should be either 'png' or 'raw'.
+      If None, we skip to write it to tf.train.Example.
+
+  Returns:
+    A dictionary mapping feature names to tf.train.Feature protos.
+  """
+  if image_format not in ('jpeg', 'png'):
+    raise ValueError('Unsupported image format: %s' % image_format)
+
+  # Check color mode, and convert grey image to rgb image.
+  image = read_image(image_data)
+  if image.mode != 'RGB':
+    image = image.convert('RGB')
+    image_data = io.BytesIO()
+    image.save(image_data, format=image_format)
+    image_data = image_data.getvalue()
+
+  height, width = get_image_dims(image_data, check_is_rgb=True)
+
+  feature_dict = {
+      common.KEY_ENCODED_IMAGE: _bytes_list_feature(image_data),
+      common.KEY_IMAGE_FILENAME: _bytes_list_feature(filename),
+      common.KEY_IMAGE_FORMAT: _bytes_list_feature(image_format),
+      common.KEY_IMAGE_HEIGHT: _int64_list_feature(height),
+      common.KEY_IMAGE_WIDTH: _int64_list_feature(width),
+      common.KEY_IMAGE_CHANNELS: _int64_list_feature(3),
+  }
+
+  if label_data is None:
+    return feature_dict
+
+  if label_format == 'png':
+    label_height, label_width = get_image_dims(label_data)
+    if (label_height, label_width) != (height, width):
+      raise ValueError('Image (%s) and label (%s) shape mismatch' %
+                       ((height, width), (label_height, label_width)))
+  elif label_format == 'raw':
+    # Raw label encodes int32 array.
+ expected_label_size = height * width * np.dtype(np.int32).itemsize
+ if len(label_data) != expected_label_size:
+ raise ValueError('Expects raw label data length %d, gets %d' %
+ (expected_label_size, len(label_data)))
+ else:
+ raise ValueError('Unsupported label format: %s' % label_format)
+
+ feature_dict.update({
+ common.KEY_ENCODED_LABEL: _bytes_list_feature(label_data),
+ common.KEY_LABEL_FORMAT: _bytes_list_feature(label_format)
+ })
+
+ return feature_dict
+
+
+def create_tfexample(image_data,
+ image_format,
+ filename,
+ label_data=None,
+ label_format=None):
+ """Converts one image/segmentation pair to a TF example.
+
+ Args:
+ image_data: String or byte stream of encoded image data.
+ image_format: String, image data format, should be either 'jpeg' or 'png'.
+ filename: String, image filename.
+ label_data: String or byte stream of (potentially) encoded label data. If
+ None, we skip writing it to the tf.train.Example.
+ label_format: String, label data format, should be either 'png' or 'raw'. If
+ None, we skip writing it to the tf.train.Example.
+
+ Returns:
+ TF example proto.
+ """
+ feature_dict = create_features(image_data, image_format, filename, label_data,
+ label_format)
+ return tf.train.Example(features=tf.train.Features(feature=feature_dict))
+
+
+def create_video_tfexample(image_data,
+ image_format,
+ filename,
+ sequence_id,
+ image_id,
+ label_data=None,
+ label_format=None,
+ prev_image_data=None,
+ prev_label_data=None):
+ """Converts one video frame/panoptic segmentation pair to a TF example.
+
+ Args:
+ image_data: String or byte stream of encoded image data.
+ image_format: String, image data format, should be either 'jpeg' or 'png'.
+ filename: String, image filename.
+ sequence_id: ID of the video sequence as a string.
+ image_id: ID of the image as a string.
+ label_data: String or byte stream of (potentially) encoded label data. If
+ None, we skip writing it to the tf.train.Example.
+ label_format: String, label data format, should be either 'png' or 'raw'. If
+ None, we skip writing it to the tf.train.Example.
+ prev_image_data: An optional string or byte stream of encoded previous image
+ data.
+ prev_label_data: An optional string or byte stream of (potentially) encoded
+ previous label data.
+
+ Returns:
+ TF example proto.
+ """
+ feature_dict = create_features(image_data, image_format, filename, label_data,
+ label_format)
+ feature_dict.update({
+ common.KEY_SEQUENCE_ID: _bytes_list_feature(sequence_id),
+ common.KEY_FRAME_ID: _bytes_list_feature(image_id)
+ })
+ if prev_image_data is not None:
+ feature_dict[common.KEY_ENCODED_PREV_IMAGE] = _bytes_list_feature(
+ prev_image_data)
+ if prev_label_data is not None:
+ feature_dict[common.KEY_ENCODED_PREV_LABEL] = _bytes_list_feature(
+ prev_label_data)
+ return tf.train.Example(features=tf.train.Features(feature=feature_dict))
+
+
+def create_video_and_depth_tfexample(image_data,
+ image_format,
+ filename,
+ sequence_id,
+ image_id,
+ label_data=None,
+ label_format=None,
+ next_image_data=None,
+ next_label_data=None,
+ depth_data=None,
+ depth_format=None):
+ """Converts an image/segmentation pair and first-frame depth to a TF example.
+
+ The image pair contains the current frame and the next frame, and the
+ current frame additionally carries a depth label.
+
+ Args:
+ image_data: String or byte stream of encoded image data.
+ image_format: String, image data format, should be either 'jpeg' or 'png'.
+ filename: String, image filename.
+ sequence_id: ID of the video sequence as a string.
+ image_id: ID of the image as a string.
+ label_data: String or byte stream of (potentially) encoded label data. If
+ None, we skip writing it to the tf.train.Example.
+ label_format: String, label data format, should be either 'png' or 'raw'. If
+ None, we skip writing it to the tf.train.Example.
+ next_image_data: An optional string or byte stream of encoded next image
+ data.
+ next_label_data: An optional string or byte stream of (potentially) encoded
+ next label data.
+ depth_data: An optional string or byte stream of encoded depth data.
+ depth_format: String, depth data format, should be either 'png' or 'raw'.
+
+ Returns:
+ TF example proto.
+ """
+ feature_dict = create_features(image_data, image_format, filename, label_data,
+ label_format)
+ feature_dict.update({
+ common.KEY_SEQUENCE_ID: _bytes_list_feature(sequence_id),
+ common.KEY_FRAME_ID: _bytes_list_feature(image_id)
+ })
+ if next_image_data is not None:
+ feature_dict[common.KEY_ENCODED_NEXT_IMAGE] = _bytes_list_feature(
+ next_image_data)
+ if next_label_data is not None:
+ feature_dict[common.KEY_ENCODED_NEXT_LABEL] = _bytes_list_feature(
+ next_label_data)
+ if depth_data is not None:
+ feature_dict[common.KEY_ENCODED_DEPTH] = _bytes_list_feature(
+ depth_data)
+ feature_dict[common.KEY_DEPTH_FORMAT] = _bytes_list_feature(
+ depth_format)
+ return tf.train.Example(features=tf.train.Features(feature=feature_dict))
+
+
+class SegmentationDecoder(object):
+ """Basic parser to decode serialized tf.Example."""
+
+ def __init__(self,
+ is_panoptic_dataset=True,
+ is_video_dataset=False,
+ use_two_frames=False,
+ use_next_frame=False,
+ decode_groundtruth_label=True):
+ self._is_panoptic_dataset = is_panoptic_dataset
+ self._is_video_dataset = is_video_dataset
+ self._use_two_frames = use_two_frames
+ self._use_next_frame = use_next_frame
+ self._decode_groundtruth_label = decode_groundtruth_label
+ string_feature = tf.io.FixedLenFeature((), tf.string)
+ int_feature = tf.io.FixedLenFeature((), tf.int64)
+ self._keys_to_features = {
+ common.KEY_ENCODED_IMAGE: string_feature,
+ common.KEY_IMAGE_FILENAME: string_feature,
+ common.KEY_IMAGE_FORMAT: string_feature,
+ common.KEY_IMAGE_HEIGHT: int_feature,
+ common.KEY_IMAGE_WIDTH: int_feature,
+ common.KEY_IMAGE_CHANNELS: int_feature,
+ }
+ if decode_groundtruth_label:
+ self._keys_to_features[common.KEY_ENCODED_LABEL] = string_feature
+ if self._is_video_dataset:
+ self._keys_to_features[common.KEY_SEQUENCE_ID] = string_feature
+ self._keys_to_features[common.KEY_FRAME_ID] = string_feature
+ # Two-frame specific processing.
+ if self._use_two_frames:
+ self._keys_to_features[common.KEY_ENCODED_PREV_IMAGE] = string_feature
+ if decode_groundtruth_label:
+ self._keys_to_features[common.KEY_ENCODED_PREV_LABEL] = string_feature
+ # Next-frame specific processing.
+ if self._use_next_frame:
+ self._keys_to_features[common.KEY_ENCODED_NEXT_IMAGE] = string_feature
+ if decode_groundtruth_label:
+ self._keys_to_features[common.KEY_ENCODED_NEXT_LABEL] = string_feature
+
+ def _decode_image(self, parsed_tensors, key):
+ """Decodes the image under key from parsed tensors."""
+ image = tf.io.decode_image(
+ parsed_tensors[key],
+ channels=3,
+ dtype=tf.dtypes.uint8,
+ expand_animations=False)
+ image.set_shape([None, None, 3])
+ return image
+
+ def _decode_label(self, parsed_tensors, label_key):
+ """Decodes the segmentation label under label_key from parsed tensors."""
+ if self._is_panoptic_dataset:
+ flattened_label = tf.io.decode_raw(
+ parsed_tensors[label_key], out_type=tf.int32)
+ label_shape = tf.stack([
+ parsed_tensors[common.KEY_IMAGE_HEIGHT],
+ parsed_tensors[common.KEY_IMAGE_WIDTH], 1
+ ])
+ label = tf.reshape(flattened_label, label_shape)
+ return label
+
+ label = tf.io.decode_image(parsed_tensors[label_key], channels=1)
+ label.set_shape([None, None, 1])
+ return label
+
+ def __call__(self, serialized_example):
+ parsed_tensors = tf.io.parse_single_example(
+ serialized_example, features=self._keys_to_features)
+ return_dict = {
+ 'image':
+ self._decode_image(parsed_tensors, common.KEY_ENCODED_IMAGE),
+ 'image_name':
+ parsed_tensors[common.KEY_IMAGE_FILENAME],
+ 'height':
+ tf.cast(parsed_tensors[common.KEY_IMAGE_HEIGHT], dtype=tf.int32),
+ 'width':
+ tf.cast(parsed_tensors[common.KEY_IMAGE_WIDTH], dtype=tf.int32),
+ }
+ return_dict['label'] = None
+ if self._decode_groundtruth_label:
+ return_dict['label'] = self._decode_label(parsed_tensors,
+ common.KEY_ENCODED_LABEL)
+ if self._is_video_dataset:
+ return_dict['sequence'] = parsed_tensors[common.KEY_SEQUENCE_ID]
+ if self._use_two_frames:
+ return_dict['prev_image'] = self._decode_image(
+ parsed_tensors, common.KEY_ENCODED_PREV_IMAGE)
+ if self._decode_groundtruth_label:
+ return_dict['prev_label'] = self._decode_label(
+ parsed_tensors, common.KEY_ENCODED_PREV_LABEL)
+ if self._use_next_frame:
+ return_dict['next_image'] = self._decode_image(
+ parsed_tensors, common.KEY_ENCODED_NEXT_IMAGE)
+ if self._decode_groundtruth_label:
+ return_dict['next_label'] = self._decode_label(
+ parsed_tensors, common.KEY_ENCODED_NEXT_LABEL)
+ return return_dict
diff --git a/data/data_utils_test.py b/data/data_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e87ba80eaa2f7099bff65f84d725dbbdcd99f161
--- /dev/null
+++ b/data/data_utils_test.py
@@ -0,0 +1,94 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
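+
+# A minimal decoding sketch (illustrative only) of the round trip these tests
+# exercise:
+#
+#   decoder = data_utils.SegmentationDecoder(is_panoptic_dataset=True)
+#   parsed = decoder(example.SerializeToString())
+#   image, label = parsed['image'], parsed['label']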
+
+"""Tests for data_utils."""
+
+import io
+import numpy as np
+from PIL import Image
+import tensorflow as tf
+
+from deeplab2.data import data_utils
+
+
+def _encode_png_image(image):
+ """Helper method to encode input image in PNG format."""
+ buffer = io.BytesIO()
+ Image.fromarray(image).save(buffer, format='png')
+ return buffer.getvalue()
+
+
+class DataUtilsTest(tf.test.TestCase):
+
+ def _create_test_image(self, height, width):
+ rng = np.random.RandomState(319281498)
+ return rng.randint(0, 255, size=(height, width, 3), dtype=np.uint8)
+
+ def test_encode_and_decode(self):
+ """Checks decoding of a created tf.Example for semantic segmentation."""
+ test_image_height = 20
+ test_image_width = 15
+ filename = 'dummy'
+
+ image = self._create_test_image(test_image_height, test_image_width)
+ # Take the first channel as a dummy label.
+ label = image[..., 0]
+
+ example = data_utils.create_tfexample(
+ image_data=_encode_png_image(image),
+ image_format='png', filename=filename,
+ label_data=_encode_png_image(label), label_format='png')
+
+ # Parse the created example, expecting identical results.
+ parser = data_utils.SegmentationDecoder(is_panoptic_dataset=False)
+ parsed_tensors = parser(example.SerializeToString())
+
+ self.assertIn('image', parsed_tensors)
+ self.assertIn('image_name', parsed_tensors)
+ self.assertIn('label', parsed_tensors)
+ self.assertEqual(filename, parsed_tensors['image_name'])
+ np.testing.assert_array_equal(image, parsed_tensors['image'].numpy())
+ # The decoded label is a 3-D array with a last dimension of 1.
+ decoded_label = parsed_tensors['label'].numpy()
+ np.testing.assert_array_equal(label, decoded_label[..., 0])
+
+ def test_encode_and_decode_panoptic(self):
+ test_image_height = 31
+ test_image_width = 17
+ filename = 'dummy'
+
+ image = self._create_test_image(test_image_height, test_image_width)
+ # Create a dummy panoptic label in np.int32 dtype.
+ label = np.dot(image.astype(np.int32), [1, 256, 256 * 256]).astype(np.int32)
+ example = data_utils.create_tfexample(
+ image_data=_encode_png_image(image),
+ image_format='png', filename=filename,
+ label_data=label.tobytes(), label_format='raw')
+
+ parser = data_utils.SegmentationDecoder(is_panoptic_dataset=True)
+ parsed_tensors = parser(example.SerializeToString())
+
+ self.assertIn('image', parsed_tensors)
+ self.assertIn('image_name', parsed_tensors)
+ self.assertIn('label', parsed_tensors)
+ self.assertEqual(filename, parsed_tensors['image_name'])
+ np.testing.assert_array_equal(image, parsed_tensors['image'].numpy())
+ # The decoded label is a 3-D array with a last dimension of 1.
+ decoded_label = parsed_tensors['label'].numpy()
+ np.testing.assert_array_equal(label, decoded_label[..., 0])
+
+
+if __name__ == '__main__':
+ tf.test.main()
diff --git a/data/dataloader/__init__.py b/data/dataloader/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..35e4ce02ff422f3aa84ab644b88d65b13e0cbc03
--- /dev/null
+++ b/data/dataloader/__init__.py
@@ -0,0 +1,15 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/data/dataloader/input_reader.py b/data/dataloader/input_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbf384e6f7ff8e3188bf2f98af9d8a14bee15a59
--- /dev/null
+++ b/data/dataloader/input_reader.py
@@ -0,0 +1,91 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Input reader to load segmentation datasets."""
+
+import tensorflow as tf
+
+_NUM_INPUTS_PROCESSED_CONCURRENTLY = 32
+_SHUFFLE_BUFFER_SIZE = 1000
+
+
+class InputReader(object):
+ """Input function that creates a dataset from files."""
+
+ def __init__(self,
+ file_pattern,
+ decoder_fn,
+ generator_fn=None,
+ is_training=False):
+ """Initializes the input reader.
+
+ Args:
+ file_pattern: The file pattern for the data examples, in TFRecord format.
+ decoder_fn: A callable that takes a serialized tf.Example and produces
+ parsed (and potentially processed / augmented) tensors.
+ generator_fn: An optional `callable` that takes the decoded raw tensors
+ dict and generates a ground-truth dictionary that can be consumed by
+ the model. It will be executed after decoder_fn (default: None).
+ is_training: Whether this dataset is used for training (default: False).
+ """
+ self._file_pattern = file_pattern
+ self._is_training = is_training
+ self._decoder_fn = decoder_fn
+ self._generator_fn = generator_fn
+
+ def __call__(self, batch_size=1, max_num_examples=-1):
+ """Provides a tf.data.Dataset object.
+
+ Args:
+ batch_size: Expected batch size of the input data.
+ max_num_examples: Positive integer or -1. If positive, the returned
+ dataset will only take (at most) this number of examples and raise
+ tf.errors.OutOfRangeError after that (default: -1).
+
+ Returns:
+ A tf.data.Dataset object.
+ """
+ dataset = tf.data.Dataset.list_files(self._file_pattern)
+
+ if self._is_training:
+ # File-level shuffle.
+ dataset = dataset.shuffle(dataset.cardinality(),
+ reshuffle_each_iteration=True)
+ dataset = dataset.repeat()
+
+ # During training, interleave TFRecord conversion for maximum efficiency.
+ # During evaluation, read input in consecutive order for tasks requiring
+ # such behavior.
+ dataset = dataset.interleave(
+ map_func=tf.data.TFRecordDataset,
+ cycle_length=(_NUM_INPUTS_PROCESSED_CONCURRENTLY
+ if self._is_training else 1),
+ num_parallel_calls=tf.data.experimental.AUTOTUNE,
+ deterministic=not self._is_training)
+
+ if self._is_training:
+ dataset = dataset.shuffle(_SHUFFLE_BUFFER_SIZE)
+ if max_num_examples > 0:
+ dataset = dataset.take(max_num_examples)
+
+ # Parses the fetched records to input tensors for the model function.
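+ # (In this codebase, decoder_fn is typically a SegmentationDecoder instance
+ # and generator_fn an optional sample generator layered on top of it; both
+ # are applied element-wise by the tf.data map calls below.)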
+ dataset = dataset.map(
+ self._decoder_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
+ if self._generator_fn is not None:
+ dataset = dataset.map(
+ self._generator_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
+ dataset = dataset.batch(batch_size, drop_remainder=True)
+ dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
+ return dataset
diff --git a/data/dataset.py b/data/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6257830d5c676c89f1a0f6fbb1066e12cf7c8ad
--- /dev/null
+++ b/data/dataset.py
@@ -0,0 +1,228 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Provides data from segmentation datasets.
+
+Currently, we support the following datasets:
+
+1. Cityscapes dataset (https://www.cityscapes-dataset.com).
+
+The Cityscapes dataset contains 19 semantic labels (such as road, person, car,
+and so on) for urban street scenes.
+
+
+2. KITTI-STEP (http://www.cvlibs.net/datasets/kitti/).
+
+The KITTI-STEP enriches the KITTI-MOTS data with additional `stuff'
+annotations.
+
+3. MOTChallenge-STEP (https://motchallenge.net/).
+
+The MOTChallenge-STEP enriches the MOTSChallenge data with additional `stuff'
+annotations.
+
+4. MSCOCO panoptic segmentation (http://cocodataset.org/#panoptic-2018).
+
+Panoptic segmentation annotations for the MSCOCO dataset. Note that we convert
+the provided MSCOCO panoptic segmentation format to the following one:
+panoptic label = semantic label * 256 + instance id.
+
+5. Cityscapes-DVPS (https://github.com/joe-siyuan-qiao/ViP-DeepLab).
+
+The Cityscapes-DVPS dataset augments Cityscapes-VPS
+(https://github.com/mcahny/vps) with depth annotations.
+
+
+References:
+
+- Marius Cordts, Mohamed Omran, Sebastian Ramos, Timo Rehfeld, Markus
+ Enzweiler, Rodrigo Benenson, Uwe Franke, Stefan Roth, and Bernt Schiele, "The
+ Cityscapes Dataset for Semantic Urban Scene Understanding." In CVPR, 2016.
+
+- Andreas Geiger, Philip Lenz, and Raquel Urtasun, "Are we ready for
+ Autonomous Driving? The KITTI Vision Benchmark Suite." In CVPR, 2012.
+
+- Alexander Kirillov, Kaiming He, Ross Girshick, Carsten Rother, and Piotr
+ Dollar, "Panoptic Segmentation." In CVPR, 2019.
+
+- Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B.
+ Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Dollar, and C.
+ Lawrence Zitnick, "Microsoft COCO: common objects in context." In ECCV, 2014.
+
+- Anton Milan, Laura Leal-Taixe, Ian Reid, Stefan Roth, and Konrad Schindler,
+ "MOT16: A benchmark for multi-object tracking." arXiv:1603.00831, 2016.
+
+- Paul Voigtlaender, Michael Krause, Aljosa Osep, Jonathon Luiten, Berin
+ Balachandar Gnana Sekar, Andreas Geiger, and Bastian Leibe. "MOTS:
+ Multi-object tracking and segmentation."
In CVPR, 2019.
+
+- Mark Weber, Jun Xie, Maxwell Collins, Yukun Zhu, Paul Voigtlaender, Hartwig
+ Adam, Bradley Green, Andreas Geiger, Bastian Leibe, Daniel Cremers, Aljosa
+ Osep, Laura Leal-Taixe, and Liang-Chieh Chen, "STEP: Segmenting and Tracking
+ Every Pixel." arXiv:2102.11859, 2021.
+
+- Dahun Kim, Sanghyun Woo, Joon-Young Lee, and In So Kweon. "Video panoptic
+ segmentation." In CVPR, 2020.
+
+- Siyuan Qiao, Yukun Zhu, Hartwig Adam, Alan Yuille, and Liang-Chieh Chen.
+ "ViP-DeepLab: Learning Visual Perception with Depth-aware Video Panoptic
+ Segmentation." In CVPR, 2021.
+"""
+
+import collections
+
+
+# Dataset names.
+_CITYSCAPES = 'cityscapes'
+_CITYSCAPES_PANOPTIC = 'cityscapes_panoptic'
+_KITTI_STEP = 'kitti_step'
+_MOTCHALLENGE_STEP = 'motchallenge_step'
+_CITYSCAPES_DVPS = 'cityscapes_dvps'
+_COCO_PANOPTIC = 'coco_panoptic'
+
+# Colormap names.
+_CITYSCAPES_COLORMAP = 'cityscapes'
+_MOTCHALLENGE_COLORMAP = 'motchallenge'
+_COCO_COLORMAP = 'coco'
+
+
+# Named tuple to describe dataset properties.
+DatasetDescriptor = collections.namedtuple(
+ 'DatasetDescriptor', [
+ 'dataset_name', # Dataset name.
+ 'splits_to_sizes', # Splits of the dataset into training, val and test.
+ 'num_classes', # Number of semantic classes.
+ 'ignore_label', # Ignore label value used for semantic segmentation.
+
+ # Fields below are used for panoptic segmentation and will be None for
+ # semantic segmentation datasets.
+ # Label divisor is only used in panoptic segmentation annotations to
+ # infer the semantic label and instance id.
+ 'panoptic_label_divisor',
+ # A tuple of classes that contain instance annotations. For example, the
+ # 'person' class has instance annotations while 'sky' does not.
+ 'class_has_instances_list',
+ # A flag indicating whether the dataset is a video dataset that contains
+ # sequence IDs and frame IDs.
+ 'is_video_dataset',
+ # A string specifying the colormap that should be used for
+ # visualization. E.g. 'cityscapes'.
+ 'colormap',
+ # A flag indicating whether the dataset contains depth annotations.
+ 'is_depth_dataset',
+ ]
+)
+
+CITYSCAPES_INFORMATION = DatasetDescriptor(
+ dataset_name=_CITYSCAPES,
+ splits_to_sizes={'train_fine': 2975,
+ 'train_coarse': 22973,
+ 'trainval_fine': 3475,
+ 'trainval_coarse': 23473,
+ 'val_fine': 500,
+ 'test_fine': 1525},
+ num_classes=19,
+ ignore_label=255,
+ panoptic_label_divisor=None,
+ class_has_instances_list=None,
+ is_video_dataset=False,
+ colormap=_CITYSCAPES_COLORMAP,
+ is_depth_dataset=False,
+)
+
+CITYSCAPES_PANOPTIC_INFORMATION = DatasetDescriptor(
+ dataset_name=_CITYSCAPES_PANOPTIC,
+ splits_to_sizes={'train_fine': 2975,
+ 'val_fine': 500,
+ 'trainval_fine': 3475,
+ 'test_fine': 1525},
+ num_classes=19,
+ ignore_label=255,
+ panoptic_label_divisor=1000,
+ class_has_instances_list=tuple(range(11, 19)),
+ is_video_dataset=False,
+ colormap=_CITYSCAPES_COLORMAP,
+ is_depth_dataset=False,
+)
+
+KITTI_STEP_INFORMATION = DatasetDescriptor(
+ dataset_name=_KITTI_STEP,
+ splits_to_sizes={'train': 5027,
+ 'val': 2981,
+ 'test': 11095},
+ num_classes=19,
+ ignore_label=255,
+ panoptic_label_divisor=1000,
+ class_has_instances_list=(11, 13),
+ is_video_dataset=True,
+ colormap=_CITYSCAPES_COLORMAP,
+ is_depth_dataset=False,
+)
+
+MOTCHALLENGE_STEP_INFORMATION = DatasetDescriptor(
+ dataset_name=_MOTCHALLENGE_STEP,
+ splits_to_sizes={'train': 525, # Sequence 9.
+ 'val': 600, # Sequence 2.
+ 'test': 0},
+ num_classes=7,
+ ignore_label=255,
+ panoptic_label_divisor=1000,
+ class_has_instances_list=(4,),
+ is_video_dataset=True,
+ colormap=_MOTCHALLENGE_COLORMAP,
+ is_depth_dataset=False,
+)
+
+CITYSCAPES_DVPS_INFORMATION = DatasetDescriptor(
+ dataset_name=_CITYSCAPES_DVPS,
+ # The numbers of images are 2400/300/300 for train/val/test. Here, the
+ # sizes are the number of consecutive frame pairs. As each sequence has 6
+ # frames, the number of pairs for the train split is 2400 / 6 * 5 = 2000.
+ # Similarly, we get 250 pairs for the val split and the test split.
+ splits_to_sizes={'train': 2000,
+ 'val': 250,
+ 'test': 250},
+ num_classes=19,
+ ignore_label=255,
+ panoptic_label_divisor=1000,
+ class_has_instances_list=tuple(range(11, 19)),
+ is_video_dataset=True,
+ colormap=_CITYSCAPES_COLORMAP,
+ is_depth_dataset=True,
+)
+
+COCO_PANOPTIC_INFORMATION = DatasetDescriptor(
+ dataset_name=_COCO_PANOPTIC,
+ splits_to_sizes={'train': 118287,
+ 'val': 5000,
+ 'test': 40670},
+ num_classes=134,
+ ignore_label=0,
+ panoptic_label_divisor=256,
+ class_has_instances_list=tuple(range(1, 81)),
+ is_video_dataset=False,
+ colormap=_COCO_COLORMAP,
+ is_depth_dataset=False,
+)
+
+MAP_NAME_TO_DATASET_INFO = {
+ _CITYSCAPES: CITYSCAPES_INFORMATION,
+ _CITYSCAPES_PANOPTIC: CITYSCAPES_PANOPTIC_INFORMATION,
+ _KITTI_STEP: KITTI_STEP_INFORMATION,
+ _MOTCHALLENGE_STEP: MOTCHALLENGE_STEP_INFORMATION,
+ _CITYSCAPES_DVPS: CITYSCAPES_DVPS_INFORMATION,
+ _COCO_PANOPTIC: COCO_PANOPTIC_INFORMATION,
+}
+
+MAP_NAMES = list(MAP_NAME_TO_DATASET_INFO.keys())
diff --git a/data/dataset_utils.py b/data/dataset_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..167b30a6182cd49ee35f9b3245bf5f0cd9c810a6
--- /dev/null
+++ b/data/dataset_utils.py
@@ -0,0 +1,71 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This file contains utility functions for handling the dataset."""
+
+import tensorflow as tf
+
+
+def get_semantic_and_panoptic_label(dataset_info, label, ignore_label):
+ """Helper function to get semantic and panoptic labels from a panoptic label.
+
+ This function gets the semantic and panoptic label from a panoptic label for
+ different datasets. The labels must be encoded with semantic_label *
+ label_divisor + instance_id. For thing classes, the instance ID 0 is reserved
+ for crowd regions. Please note that in the returned panoptic label, crowd
+ regions are replaced with ignore regions, while the semantic label still
+ makes use of these regions.
+
+ Args:
+ dataset_info: A dictionary storing dataset information.
+ label: A Tensor of panoptic label.
+ ignore_label: An integer specifying the ignore_label.
+
+ Returns:
+ semantic_label: A Tensor of semantic segmentation label.
+ panoptic_label: A Tensor of panoptic segmentation label, which follows the
+ Cityscapes annotation where
+ panoptic_label = semantic_label * panoptic_label_divisor + instance_id.
+ thing_mask: A boolean Tensor specifying the thing regions. Zero if no thing.
+ crowd_region: A boolean Tensor specifying the crowd region. Zero if no crowd
+ annotation.
+
+ Raises:
+ ValueError: An error occurs when the ignore_label is not in the range
+ [0, panoptic_label_divisor).
+ """
+ panoptic_label_divisor = dataset_info['panoptic_label_divisor']
+ if ignore_label >= panoptic_label_divisor or ignore_label < 0:
+ raise ValueError('The ignore_label must be in [0, panoptic_label_divisor).')
+
+ semantic_label = label // panoptic_label_divisor
+ # Find the iscrowd region, if any, and set it to ignore for panoptic labels.
+ # 1. Find the thing mask.
+ thing_mask = tf.zeros_like(semantic_label, tf.bool)
+ for thing_id in dataset_info['class_has_instances_list']:
+ thing_mask = tf.logical_or(
+ thing_mask,
+ tf.equal(semantic_label, thing_id))
+ # 2. Find the crowd region (thing labels that have instance_id == 0).
+ crowd_region = tf.logical_and(
+ thing_mask,
+ tf.equal(label % panoptic_label_divisor, 0))
+ # 3. Set the crowd region to the ignore label.
+ panoptic_label = tf.where(
+ crowd_region,
+ tf.ones_like(label) * ignore_label * panoptic_label_divisor,
+ label)
+
+ return semantic_label, panoptic_label, thing_mask, crowd_region
diff --git a/data/dataset_utils_test.py b/data/dataset_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7538bf0cc6f7199ecbffee8be1e2d70a97b1524
--- /dev/null
+++ b/data/dataset_utils_test.py
@@ -0,0 +1,90 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for dataset_utils."""
+
+import numpy as np
+import tensorflow as tf
+
+from deeplab2.data import dataset_utils
+
+
+class DatasetUtilsTest(tf.test.TestCase):
+
+ def _get_test_labels(self, num_classes, shape, label_divisor):
+ num_ids_per_class = 35
+ semantic_labels = np.random.randint(num_classes, size=shape)
+ panoptic_labels = np.random.randint(
+ num_ids_per_class, size=shape) + semantic_labels * label_divisor
+
+ semantic_labels = tf.convert_to_tensor(semantic_labels, dtype=tf.int32)
+ panoptic_labels = tf.convert_to_tensor(panoptic_labels, dtype=tf.int32)
+
+ return panoptic_labels, semantic_labels
+
+ def setUp(self):
+ super().setUp()
+ self._first_thing_class = 9
+ self._num_classes = 19
+ self._dataset_info = {
+ 'panoptic_label_divisor': 1000,
+ 'class_has_instances_list': tf.range(self._first_thing_class,
+ self._num_classes)
+ }
+ self._num_ids = 37
+ self._labels, self._semantic_classes = self._get_test_labels(
+ self._num_classes, [2, 33, 33],
+ self._dataset_info['panoptic_label_divisor'])
+
+ def test_get_panoptic_and_semantic_label(self):
+ # Note: self._labels contains one crowd instance per class.
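+ # (For intuition: a crowd pixel labeled 12 * 1000 + 0 keeps semantic class
+ # 12, but its returned panoptic label becomes 255 * 1000, i.e. the ignore
+ # label times the label divisor.)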
+ (returned_sem_labels, returned_pan_labels, returned_thing_mask,
+ returned_crowd_region) = (
+ dataset_utils.get_semantic_and_panoptic_label(
+ self._dataset_info, self._labels, ignore_label=255))
+
+ expected_semantic_labels = self._semantic_classes
+ condition = self._labels % self._dataset_info['panoptic_label_divisor'] == 0
+ condition = tf.logical_and(
+ condition,
+ tf.math.greater_equal(expected_semantic_labels,
+ self._first_thing_class))
+ expected_crowd_labels = tf.where(condition, 1.0, 0.0)
+ expected_pan_labels = tf.where(
+ condition, 255 * self._dataset_info['panoptic_label_divisor'],
+ self._labels)
+ expected_thing_mask = tf.where(
+ tf.math.greater_equal(expected_semantic_labels,
+ self._first_thing_class), 1.0, 0.0)
+
+ self.assertListEqual(returned_sem_labels.shape.as_list(),
+ expected_semantic_labels.shape.as_list())
+ self.assertListEqual(returned_pan_labels.shape.as_list(),
+ expected_pan_labels.shape.as_list())
+ self.assertListEqual(returned_crowd_region.shape.as_list(),
+ expected_crowd_labels.shape.as_list())
+ self.assertListEqual(returned_thing_mask.shape.as_list(),
+ expected_thing_mask.shape.as_list())
+ np.testing.assert_equal(returned_sem_labels.numpy(),
+ expected_semantic_labels.numpy())
+ np.testing.assert_equal(returned_pan_labels.numpy(),
+ expected_pan_labels.numpy())
+ np.testing.assert_equal(returned_crowd_region.numpy(),
+ expected_crowd_labels.numpy())
+ np.testing.assert_equal(returned_thing_mask.numpy(),
+ expected_thing_mask.numpy())
+
+
+if __name__ == '__main__':
+ tf.test.main()
diff --git a/data/preprocessing/__init__.py b/data/preprocessing/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..35e4ce02ff422f3aa84ab644b88d65b13e0cbc03
--- /dev/null
+++ b/data/preprocessing/__init__.py
@@ -0,0 +1,15 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/data/preprocessing/autoaugment_policy.py b/data/preprocessing/autoaugment_policy.py
new file mode 100644
index 0000000000000000000000000000000000000000..cec895228580f1ea0f4f3b9e96ccb6d5bf288113
--- /dev/null
+++ b/data/preprocessing/autoaugment_policy.py
@@ -0,0 +1,75 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""AutoAugment policy file.
+
+This file contains the AutoAugment policies found by the search.
+
+Please cite or refer to the following papers for details:
+- Ekin D Cubuk, Barret Zoph, Dandelion Mane, Vijay Vasudevan, and Quoc V Le.
+"Autoaugment: Learning augmentation policies from data." In CVPR, 2019. + +- Ekin D Cubuk, Barret Zoph, Jonathon Shlens, and Quoc V Le. +"Randaugment: Practical automated data augmentation with a reduced search +space." In CVPR, 2020. +""" + +# Reduced augmentation operation space. +augmentation_reduced_operations = ( + 'AutoContrast', 'Equalize', 'Invert', 'Posterize', + 'Solarize', 'Color', 'Contrast', 'Brightness', 'Sharpness') + +augmentation_probabilities = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + + +def convert_policy(policy, + search_space=augmentation_reduced_operations, + probability_scale=1.0, + magnitude_scale=1): + """Converts policy from a list of numbers.""" + if len(policy) % 6: + raise ValueError('Policy length must be a multiple of 6.') + num_policies = len(policy) // 6 + policy_list = [[] for _ in range(num_policies)] + for n in range(num_policies): + for i in range(2): + operation_id, prob_id, magnitude = ( + policy[6 * n + i * 3 : 6 * n + (i + 1) * 3]) + policy_name = search_space[operation_id] + policy_prob = ( + augmentation_probabilities[prob_id] * probability_scale) + policy_list[n].append((policy_name, + policy_prob, + magnitude * magnitude_scale)) + return policy_list + + +simple_classification_policy = [8, 2, 7, 7, 1, 10, + 1, 0, 9, 6, 1, 10, + 8, 1, 9, 5, 1, 9, + 4, 1, 7, 1, 3, 9, + 8, 1, 1, 1, 1, 7] + +# All available policies. +available_policies = { + 'simple_classification_policy_magnitude_scale_0.2': convert_policy( + simple_classification_policy, + augmentation_reduced_operations, + magnitude_scale=0.2), + 'simple_classification_policy': convert_policy( + simple_classification_policy, + augmentation_reduced_operations, + magnitude_scale=1), +} diff --git a/data/preprocessing/autoaugment_policy_test.py b/data/preprocessing/autoaugment_policy_test.py new file mode 100644 index 0000000000000000000000000000000000000000..e9b02a38cb3def5a7277f2142ca5a27d0552cdcb --- /dev/null +++ b/data/preprocessing/autoaugment_policy_test.py @@ -0,0 +1,43 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for autoaugment_policy.py.""" + +import tensorflow as tf + +from deeplab2.data.preprocessing import autoaugment_policy + + +class AutoaugmentPolicyTest(tf.test.TestCase): + + def testConvertPolicy(self): + policy = [5, 1, 10, 5, 3, 4, + 6, 3, 7, 3, 3, 9, + 2, 2, 8, 8, 2, 8, + 1, 4, 9, 4, 5, 7, + 6, 4, 1, 1, 3, 4] + expected = [ + [('Color', 0.2, 10), ('Color', 0.6, 4)], + [('Contrast', 0.6, 7), ('Posterize', 0.6, 9)], + [('Invert', 0.4, 8), ('Sharpness', 0.4, 8)], + [('Equalize', 0.8, 9), ('Solarize', 1.0, 7)], + [('Contrast', 0.8, 1), ('Equalize', 0.6, 4)], + ] + policy_list = autoaugment_policy.convert_policy(policy) + self.assertAllEqual(policy_list, expected) + + +if __name__ == '__main__': + tf.test.main() diff --git a/data/preprocessing/autoaugment_utils.py b/data/preprocessing/autoaugment_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3600b51b1018fc399158d0c4ebe1e772975a5c6a --- /dev/null +++ b/data/preprocessing/autoaugment_utils.py @@ -0,0 +1,422 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""AutoAugment utility file. + +Please cite or refer to the following papers: +- Ekin D Cubuk, Barret Zoph, Dandelion Mane, Vijay Vasudevan, and Quoc V Le. +"Autoaugment: Learning augmentation policies from data." In CVPR, 2019. + +- Ekin D Cubuk, Barret Zoph, Jonathon Shlens, and Quoc V Le. +"Randaugment: Practical automated data augmentation with a reduced search +space." In CVPR, 2020. +""" + +import inspect + +import tensorflow as tf + +from deeplab2.data.preprocessing import autoaugment_policy + + +# This signifies the max integer that the controller RNN could predict for the +# augmentation scheme. +_MAX_LEVEL = 10. + + +def blend(image1, image2, factor): + """Blends image1 and image2 using 'factor'. + + Factor can be above 0.0. A value of 0.0 means only image1 is used. + A value of 1.0 means only image2 is used. A value between 0.0 and + 1.0 means we linearly interpolate the pixel values between the two + images. A value greater than 1.0 "extrapolates" the difference + between the two pixel values, and we clip the results to values + between 0 and 255. + + Args: + image1: An image Tensor of type uint8. + image2: An image Tensor of type uint8. + factor: A floating point value above 0.0. + + Returns: + A blended image Tensor of type uint8. + """ + if factor == 0.0: + return tf.convert_to_tensor(image1) + if factor == 1.0: + return tf.convert_to_tensor(image2) + + image1 = tf.cast(image1, tf.float32) + image2 = tf.cast(image2, tf.float32) + + difference = image2 - image1 + scaled = factor * difference + + # Do addition in float. + temp = tf.cast(image1, tf.float32) + scaled + + # Interpolate + if factor > 0.0 and factor < 1.0: + # Interpolation means we always stay within 0 and 255. + return tf.cast(temp, tf.uint8) + + # Extrapolate: + # + # We need to clip and then cast. 
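+ # (Worked example: factor=1.5 with image1=100 and image2=200 extrapolates to
+ # 100 + 1.5 * 100 = 250, which survives the clip; factor=2.0 would give 300
+ # and be clipped to 255.)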
+ return tf.cast(tf.clip_by_value(temp, 0.0, 255.0), tf.uint8) + + +def solarize(image, threshold=128): + # For each pixel in the image, select the pixel + # if the value is less than the threshold. + # Otherwise, subtract 255 from the pixel. + return tf.where(image < threshold, image, 255 - image) + + +def invert(image): + """Inverts the image pixels.""" + image = tf.convert_to_tensor(image) + return 255 - image + + +def color(image, factor): + """Equivalent of PIL Color.""" + degenerate = tf.image.grayscale_to_rgb(tf.image.rgb_to_grayscale(image)) + return blend(degenerate, image, factor) + + +def contrast(image, factor): + """Equivalent of PIL Contrast.""" + degenerate = tf.image.rgb_to_grayscale(image) + # Cast before calling tf.histogram. + degenerate = tf.cast(degenerate, tf.int32) + + # Compute the grayscale histogram, then compute the mean pixel value, + # and create a constant image size of that value. Use that as the + # blending degenerate target of the original image. + hist = tf.histogram_fixed_width(degenerate, [0, 255], nbins=256) + mean = tf.reduce_sum(tf.cast(hist, tf.float32)) / 256.0 + degenerate = tf.ones_like(degenerate, dtype=tf.float32) * mean + degenerate = tf.clip_by_value(degenerate, 0.0, 255.0) + degenerate = tf.image.grayscale_to_rgb(tf.cast(degenerate, tf.uint8)) + return blend(degenerate, image, factor) + + +def brightness(image, factor): + """Equivalent of PIL Brightness.""" + degenerate = tf.zeros_like(image) + return blend(degenerate, image, factor) + + +def posterize(image, bits): + """Equivalent of PIL Posterize.""" + shift = 8 - bits + return tf.bitwise.left_shift(tf.bitwise.right_shift(image, shift), shift) + + +def autocontrast(image): + """Implements Autocontrast function from PIL using TF ops. + + Args: + image: A 3D uint8 tensor. + + Returns: + The image after it has had autocontrast applied to it and will be of type + uint8. + """ + + def scale_channel(image): + """Scale the 2D image using the autocontrast rule.""" + # A possibly cheaper version can be done using cumsum/unique_with_counts + # over the histogram values, rather than iterating over the entire image. + # to compute mins and maxes. + lo = tf.cast(tf.reduce_min(image), tf.float32) + hi = tf.cast(tf.reduce_max(image), tf.float32) + + # Scale the image, making the lowest value 0 and the highest value 255. + def scale_values(im): + scale = 255.0 / (hi - lo) + offset = -lo * scale + im = tf.cast(im, tf.float32) * scale + offset + im = tf.clip_by_value(im, 0.0, 255.0) + return tf.cast(im, tf.uint8) + + result = tf.cond(hi > lo, lambda: scale_values(image), lambda: image) + return result + + # Assumes RGB for now. Scales each channel independently + # and then stacks the result. + s1 = scale_channel(image[:, :, 0]) + s2 = scale_channel(image[:, :, 1]) + s3 = scale_channel(image[:, :, 2]) + image = tf.stack([s1, s2, s3], 2) + return image + + +def sharpness(image, factor): + """Implements Sharpness function from PIL using TF ops.""" + orig_image = image + image = tf.cast(image, tf.float32) + # Make image 4D for conv operation. + image = tf.expand_dims(image, 0) + # SMOOTH PIL Kernel. + kernel = tf.constant( + [[1, 1, 1], [1, 5, 1], [1, 1, 1]], dtype=tf.float32, + shape=[3, 3, 1, 1]) / 13. + # Tile across channel dimension. 
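+ # (After tiling, the kernel has shape [3, 3, 3, 1]: one 3x3 smoothing filter
+ # per RGB channel, matching the [height, width, in_channels, multiplier]
+ # filter layout expected by tf.nn.depthwise_conv2d.)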
+ kernel = tf.tile(kernel, [1, 1, 3, 1]) + strides = [1, 1, 1, 1] + degenerate = tf.nn.depthwise_conv2d( + image, kernel, strides, padding='VALID', dilations=[1, 1]) + degenerate = tf.clip_by_value(degenerate, 0.0, 255.0) + degenerate = tf.squeeze(tf.cast(degenerate, tf.uint8), [0]) + + # For the borders of the resulting image, fill in the values of the + # original image. + mask = tf.ones_like(degenerate) + padded_mask = tf.pad(mask, [[1, 1], [1, 1], [0, 0]]) + padded_degenerate = tf.pad(degenerate, [[1, 1], [1, 1], [0, 0]]) + result = tf.where(tf.equal(padded_mask, 1), padded_degenerate, orig_image) + + # Blend the final result. + return blend(result, orig_image, factor) + + +def equalize(image): + """Implements Equalize function from PIL using TF ops.""" + def scale_channel(im, c): + """Scale the data in the channel to implement equalize.""" + im = tf.cast(im[:, :, c], tf.int32) + # Compute the histogram of the image channel. + histo = tf.histogram_fixed_width(im, [0, 255], nbins=256) + + # For the purposes of computing the step, filter out the nonzeros. + nonzero = tf.where(tf.not_equal(histo, 0)) + nonzero_histo = tf.reshape(tf.gather(histo, nonzero), [-1]) + step = (tf.reduce_sum(nonzero_histo) - nonzero_histo[-1]) // 255 + + def build_lut(histo, step): + # Compute the cumulative sum, shifting by step // 2 + # and then normalization by step. + lut = (tf.cumsum(histo) + (step // 2)) // step + # Shift lut, prepending with 0. + lut = tf.concat([[0], lut[:-1]], 0) + # Clip the counts to be in range. This is done + # in the C code for image.point. + return tf.clip_by_value(lut, 0, 255) + + # If step is zero, return the original image. Otherwise, build + # lut from the full histogram and step and then index from it. + result = tf.cond(tf.equal(step, 0), + lambda: im, + lambda: tf.gather(build_lut(histo, step), im)) + + return tf.cast(result, tf.uint8) + + # Assumes RGB for now. Scales each channel independently + # and then stacks the result. + s1 = scale_channel(image, 0) + s2 = scale_channel(image, 1) + s3 = scale_channel(image, 2) + image = tf.stack([s1, s2, s3], 2) + return image + + +NAME_TO_FUNC = { + 'AutoContrast': autocontrast, + 'Equalize': equalize, + 'Invert': invert, + 'Posterize': posterize, + 'Solarize': solarize, + 'Color': color, + 'Contrast': contrast, + 'Brightness': brightness, + 'Sharpness': sharpness, +} + + +def _enhance_level_to_arg(level): + return ((level/_MAX_LEVEL) * 1.8 + 0.1,) + + +def level_to_arg(): + return { + 'AutoContrast': + lambda level: (), + 'Equalize': + lambda level: (), + 'Invert': + lambda level: (), + 'Posterize': lambda level: (int((level/_MAX_LEVEL) * 4),), + 'Solarize': lambda level: (int((level/_MAX_LEVEL) * 256),), + 'Color': + _enhance_level_to_arg, + 'Contrast': + _enhance_level_to_arg, + 'Brightness': + _enhance_level_to_arg, + 'Sharpness': + _enhance_level_to_arg, + } + + +def label_wrapper(func): + """Adds a label function argument to func and returns unchanged label.""" + def wrapper(images, label, *args, **kwargs): + return func(images, *args, **kwargs), label + return wrapper + + +def _parse_policy_info(name, prob, level, replace_value, ignore_label): + """Returns the function corresponding to `name` and update `level` param.""" + func = NAME_TO_FUNC[name] + args = level_to_arg()[name](level) + + if 'prob' in inspect.getfullargspec(func)[0]: + args = tuple([prob] + list(args)) + + # Add in replace arg if it is required for the function that is being called. 
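+ # (None of the color operations registered in NAME_TO_FUNC above declare
+ # 'replace' or 'ignore_label' parameters; this branch is presumably retained
+ # for parity with the fuller AutoAugment operation set, whose geometric ops
+ # need a fill value.)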
+ if 'replace' in inspect.getfullargspec(func)[0]:
+ # Make sure ignore_label is also in the arguments.
+ assert 'ignore_label' in inspect.getfullargspec(func)[0]
+ # Make sure replace is the second-from-last argument.
+ assert 'replace' == inspect.getfullargspec(func)[0][-2]
+ # Make sure ignore_label is the final argument.
+ assert 'ignore_label' == inspect.getfullargspec(func)[0][-1]
+ args = tuple(list(args) + [replace_value, ignore_label])
+
+ # Add label as the second positional argument for the function if it does
+ # not already exist.
+ if 'label' not in inspect.getfullargspec(func)[0]:
+ func = label_wrapper(func)
+ return (func, prob, args)
+
+
+def _apply_func_with_prob(func, image, args, prob, label):
+ """Apply `func` to image w/ `args` as input with probability `prob`."""
+ assert isinstance(args, tuple)
+ assert 'label' == inspect.getfullargspec(func)[0][1]
+
+ # If prob is a function argument, then this randomness is being handled
+ # inside the function, so make sure it is always called.
+ if 'prob' in inspect.getfullargspec(func)[0]:
+ prob = 1.0
+
+ # Apply the function with probability `prob`.
+ should_apply_op = tf.cast(
+ tf.floor(tf.random.uniform([], dtype=tf.float32) + prob), tf.bool)
+ augmented_image, augmented_label = tf.cond(
+ should_apply_op,
+ lambda: func(image, label, *args),
+ lambda: (image, label))
+ return augmented_image, augmented_label
+
+
+def select_and_apply_random_policy(policies, image, label):
+ """Selects a random policy from `policies` and applies it to `image`."""
+ policy_to_select = tf.random.uniform([], maxval=len(policies), dtype=tf.int32)
+ # Note that using tf.case instead of tf.conds would result in significantly
+ # larger graphs and would even break export for some larger policies.
+ for (i, policy) in enumerate(policies):
+ image, label = tf.cond(
+ tf.equal(i, policy_to_select),
+ lambda selected_policy=policy: selected_policy(image, label),
+ lambda: (image, label))
+ return (image, label)
+
+
+def build_and_apply_autoaugment_policy(policies, image, label, ignore_label):
+ """Builds a policy from the given policies and applies it to the image.
+
+ Args:
+ policies: list of lists of tuples in the form `(func, prob, level)`, `func`
+ is a string name of the augmentation function, `prob` is the probability
+ of applying the `func` operation, `level` is the input argument for
+ `func`.
+ image: tf.Tensor that the resulting policy will be applied to.
+ label: tf.Tensor that the resulting policy will be applied to.
+ ignore_label: The label value which will be ignored for training and
+ evaluation.
+
+ Returns:
+ A version of image that now has data augmentation applied to it based on
+ the `policies` passed into the function, together with the correspondingly
+ augmented label.
+ """
+ replace_value = [128, 128, 128]
+
+ # func is the string name of the augmentation function, prob is the
+ # probability of applying the operation and level is the parameter associated
+ # with the tf op.
+
+ # tf_policies are functions that take in an image and return an augmented
+ # image.
+ tf_policies = []
+ for policy in policies:
+ tf_policy = []
+ # Link string name to the correct python function and make sure the correct
+ # argument is passed into that function.
+ for policy_info in policy:
+ policy_info = (
+ list(policy_info) + [replace_value, ignore_label])
+
+ tf_policy.append(_parse_policy_info(*policy_info))
+ # Now build the tf policy that will apply the augmentation procedure
+ # on image.
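+ # (make_final_policy exists to bind tf_policy by value; a bare lambda here
+ # would close over the loop variable, and every entry of tf_policies would
+ # then apply the last policy.)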
+ def make_final_policy(tf_policy_):
+ def final_policy(image_, label_):
+ for func, prob, args in tf_policy_:
+ image_, label_ = _apply_func_with_prob(
+ func, image_, args, prob, label_)
+ return image_, label_
+ return final_policy
+ tf_policies.append(make_final_policy(tf_policy))
+
+ augmented_images, augmented_label = select_and_apply_random_policy(
+ tf_policies, image, label)
+ # Return the augmented image and label.
+ return (augmented_images, augmented_label)
+
+
+def distort_image_with_autoaugment(image,
+ label,
+ ignore_label,
+ augmentation_name=None):
+ """Applies the AutoAugment policy to `image` and `label`.
+
+ Args:
+ image: `Tensor` of shape [height, width, 3] representing an image.
+ label: `Tensor` of shape [height, width, 1] representing a label.
+ ignore_label: The label value which will be ignored for training and
+ evaluation.
+ augmentation_name: The name of the AutoAugment policy to use. See
+ autoaugment_policy.py for available_policies.
+
+ Returns:
+ A tuple containing the augmented versions of `image` and `label`.
+
+ Raises:
+ ValueError: If the augmentation_name is not in available_policies.
+ """
+ if augmentation_name:
+ available_policies = autoaugment_policy.available_policies
+ if augmentation_name not in available_policies:
+ raise ValueError(
+ 'Invalid augmentation_name: {}'.format(augmentation_name))
+ policy = available_policies[augmentation_name]
+ return build_and_apply_autoaugment_policy(
+ policy, image, label, ignore_label)
+ return image, label
diff --git a/data/preprocessing/autoaugment_utils_test.py b/data/preprocessing/autoaugment_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5347198dd2cf21a4068c9df242497f63fa503f1b
--- /dev/null
+++ b/data/preprocessing/autoaugment_utils_test.py
@@ -0,0 +1,40 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for autoaugment_utils.py."""
+
+import numpy as np
+import tensorflow as tf
+
+from deeplab2.data.preprocessing import autoaugment_utils
+
+
+class AutoaugmentUtilsTest(tf.test.TestCase):
+
+ def testAugmentWithNamedPolicy(self):
+ num_classes = 3
+ np_image = np.random.randint(256, size=(13, 13, 3))
+ image = tf.constant(np_image, dtype=tf.uint8)
+ np_label = np.random.randint(num_classes, size=(13, 13, 1))
+ label = tf.constant(np_label, dtype=tf.int32)
+ image, label = autoaugment_utils.distort_image_with_autoaugment(
+ image, label, ignore_label=255,
+ augmentation_name='simple_classification_policy')
+ self.assertTrue(image.numpy().any())
+ self.assertTrue(label.numpy().any())
+
+
+if __name__ == '__main__':
+ tf.test.main()
diff --git a/data/preprocessing/input_preprocessing.py b/data/preprocessing/input_preprocessing.py
new file mode 100644
index 0000000000000000000000000000000000000000..e44b68b4aee7c0a87c27e9dc4e0db7d84ad1d731
--- /dev/null
+++ b/data/preprocessing/input_preprocessing.py
@@ -0,0 +1,307 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This file contains functions to preprocess images and labels.""" + +import tensorflow as tf + +from deeplab2.data.preprocessing import autoaugment_utils +from deeplab2.data.preprocessing import preprocess_utils + +# The probability of flipping the images and labels +# left-right during training +_PROB_OF_FLIP = 0.5 + +_MEAN_PIXEL = [127.5, 127.5, 127.5] + + +def _pad_image_and_label(image, label, offset_height, offset_width, + target_height, target_width, ignore_label=None): + """Pads the image and the label to the given size. + + Args: + image: A tf.Tensor of shape [height, width, channels]. + label: A tf.Tensor of shape [height, width, 1] or None. + offset_height: The number of rows of zeros to add on top of the image and + label. + offset_width: The number of columns of zeros to add on the left of the image + and label. + target_height: The total height after padding. + target_width: The total width after padding. + ignore_label: The ignore_label for the label. Must only be set when label is + given. + + Returns: + The padded image and label as a tuple (padded_image, padded_label). + + Raises: + tf.errors.InvalidArgumentError: An error occurs if the padding configuration + is invalid. + ValueError: An error occurs if label is given without an ignore_label. + """ + height = tf.shape(image)[0] + width = tf.shape(image)[1] + original_dtype = image.dtype + if original_dtype not in (tf.float32, tf.float64): + image = tf.cast(image, tf.float32) + + bottom_padding = target_height - offset_height - height + right_padding = target_width - offset_width - width + + assert_bottom_padding = tf.assert_greater( + bottom_padding, -1, + 'The padding configuration is not valid. Please either increase the ' + 'target size or reduce the padding offset.') + assert_right_padding = tf.assert_greater( + right_padding, -1, 'The padding configuration is not valid. Please either' + ' increase the target size or reduce the padding offset.') + with tf.control_dependencies([assert_bottom_padding, assert_right_padding]): + paddings = [[offset_height, bottom_padding], [offset_width, right_padding], + [0, 0]] + + image = image - _MEAN_PIXEL + image = tf.pad(image, paddings) + image = image + _MEAN_PIXEL + image = tf.cast(image, original_dtype) + + if label is not None: + if ignore_label is None: + raise ValueError( + 'If a label is given, the ignore label must be set too.') + label = tf.pad(label, paddings, constant_values=ignore_label) + + return image, label + + +def _update_max_resize_value(max_resize_value, crop_size, is_inference=False): + """Checks and may update max_resize_value. + + Args: + max_resize_value: A 2-tuple of (height, width), maximum allowed value + after resize. If a single element is given, then height and width + share the same value. None, empty or having 0 indicates no maximum value + will be used. + crop_size: A 2-tuple of (height, width), crop size used. + is_inference: Boolean, whether the model is performing inference or not. 
+ + Returns: + Updated max_resize_value. + """ + max_resize_value = preprocess_utils.process_resize_value(max_resize_value) + if max_resize_value is None and is_inference: + # During inference, default max_resize_value to crop size to allow + # model taking input images with larger sizes. + max_resize_value = crop_size + + if max_resize_value is None: + return None + + if max_resize_value[0] > crop_size[0] or max_resize_value[1] > crop_size[1]: + raise ValueError( + 'Maximum resize value provided (%s) exceeds model crop size (%s)' % + (max_resize_value, crop_size)) + return max_resize_value + + +def preprocess_image_and_label(image, + label, + crop_height, + crop_width, + prev_image=None, + prev_label=None, + min_resize_value=None, + max_resize_value=None, + resize_factor=None, + min_scale_factor=1., + max_scale_factor=1., + scale_factor_step_size=0, + ignore_label=None, + is_training=True, + autoaugment_policy_name=None): + """Preprocesses the image and label. + + Args: + image: A tf.Tensor containing the image with shape [height, width, 3]. + label: A tf.Tensor containing the label with shape [height, width, 1] or + None. + crop_height: The height value used to crop the image and label. + crop_width: The width value used to crop the image and label. + prev_image: An optional tensor of shape [image_height, image_width, 3]. + prev_label: An optional tensor of shape [label_height, label_width, 1]. + min_resize_value: A 2-tuple of (height, width), desired minimum value + after resize. If a single element is given, then height and width share + the same value. None, empty or having 0 indicates no minimum value will + be used. + max_resize_value: A 2-tuple of (height, width), maximum allowed value + after resize. If a single element is given, then height and width + share the same value. None, empty or having 0 indicates no maximum value + will be used. + resize_factor: Resized dimensions are multiple of factor plus one. + min_scale_factor: Minimum scale factor for random scale augmentation. + max_scale_factor: Maximum scale factor for random scale augmentation. + scale_factor_step_size: The step size from min scale factor to max scale + factor. The input is randomly scaled based on the value of + (min_scale_factor, max_scale_factor, scale_factor_step_size). + ignore_label: The label value which will be ignored for training and + evaluation. + is_training: If the preprocessing is used for training or not. + autoaugment_policy_name: String, autoaugment policy name. See + autoaugment_policy.py for available policies. + + Returns: + resized_image: The resized input image without other augmentations as a + tf.Tensor. + processed_image: The preprocessed image as a tf.Tensor. + label: The preprocessed groundtruth segmentation label as a tf.Tensor. + + Raises: + ValueError: Ground truth label not provided during training. + """ + if is_training and label is None: + raise ValueError('During training, label must be provided.') + + image.get_shape().assert_is_compatible_with(tf.TensorShape([None, None, 3])) + + # Keep reference to original image. + resized_image = image + if prev_image is not None: + image = tf.concat([image, prev_image], axis=2) + processed_image = tf.cast(image, tf.float32) + processed_prev_image = None + + if label is not None: + label.get_shape().assert_is_compatible_with(tf.TensorShape([None, None, 1])) + if prev_label is not None: + label = tf.concat([label, prev_label], axis=2) + label = tf.cast(label, tf.int32) + + # Resize image and label to the desired range. 
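+ # (Worked example: resize_factor=32 keeps resized dimensions of the form
+ # 32 * k + 1, e.g. 513 or 1025, presumably to pair with the
+ # align_corners=True resizing below.)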
+ if any([min_resize_value, max_resize_value, not is_training]): + max_resize_value = _update_max_resize_value( + max_resize_value, + crop_size=(crop_height, crop_width), + is_inference=not is_training) + + processed_image, label = ( + preprocess_utils.resize_to_range( + image=processed_image, + label=label, + min_size=min_resize_value, + max_size=max_resize_value, + factor=resize_factor, + align_corners=True)) + if prev_image is None: + resized_image = tf.identity(processed_image) + else: + resized_image, _ = tf.split(processed_image, 2, axis=2) + + if prev_image is not None: + processed_image, processed_prev_image = tf.split(processed_image, 2, axis=2) + + if prev_label is not None: + label, prev_label = tf.split(label, 2, axis=2) + + if not is_training: + image_height = tf.shape(processed_image)[0] + image_width = tf.shape(processed_image)[1] + + offset_height = 0 + offset_width = 0 + processed_image, label = _pad_image_and_label(processed_image, label, + offset_height, offset_width, + crop_height, crop_width, + ignore_label) + processed_image.set_shape([crop_height, crop_width, 3]) + if label is not None: + label.set_shape([crop_height, crop_width, 1]) + if prev_image is not None: + processed_prev_image, prev_label = _pad_image_and_label( + processed_prev_image, prev_label, offset_height, offset_width, + crop_height, crop_width, ignore_label) + processed_prev_image.set_shape([crop_height, crop_width, 3]) + if prev_label is not None: + prev_label.set_shape([crop_height, crop_width, 1]) + return (resized_image, processed_image, label, processed_prev_image, + prev_label) + + # Data augmentation by randomly scaling the inputs. + scale = preprocess_utils.get_random_scale( + min_scale_factor, max_scale_factor, scale_factor_step_size) + processed_image, label = preprocess_utils.randomly_scale_image_and_label( + processed_image, label, scale) + if processed_prev_image is not None: + (processed_prev_image, + prev_label) = preprocess_utils.randomly_scale_image_and_label( + processed_prev_image, prev_label, scale) + + # Apply autoaugment if any. + if autoaugment_policy_name: + processed_image, label = _autoaugment_helper( + processed_image, label, ignore_label, autoaugment_policy_name) + if processed_prev_image is not None: + processed_prev_image, prev_label = _autoaugment_helper( + processed_prev_image, prev_label, ignore_label, + autoaugment_policy_name) + + # Pad image and label to have dimensions >= [crop_height, crop_width]. + image_height = tf.shape(processed_image)[0] + image_width = tf.shape(processed_image)[1] + target_height = image_height + tf.maximum(crop_height - image_height, 0) + target_width = image_width + tf.maximum(crop_width - image_width, 0) + + # Randomly crop the image and label. 
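+  # For intuition (hypothetical sizes): with a 65x65 crop, a scaled 60x40
+  # image is first padded up to 65x65, making the subsequent crop a no-op,
+  # while a 130x80 image is left unpadded and a random 65x65 window is cut
+  # out of it. The random offsets below only matter in the padding case.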
+ def _uniform_offset(margin): + return tf.random.uniform( + [], minval=0, maxval=tf.maximum(margin, 1), dtype=tf.int32) + + offset_height = _uniform_offset(crop_height - image_height) + offset_width = _uniform_offset(crop_width - image_width) + processed_image, label = _pad_image_and_label(processed_image, label, + offset_height, offset_width, + target_height, target_width, + ignore_label) + if processed_prev_image is not None: + processed_prev_image, prev_label = _pad_image_and_label( + processed_prev_image, prev_label, offset_height, offset_width, + target_height, target_width, ignore_label) + + if processed_prev_image is not None: + (processed_image, label, processed_prev_image, + prev_label) = preprocess_utils.random_crop( + [processed_image, label, processed_prev_image, prev_label], + crop_height, crop_width) + # Randomly left-right flip the image and label. + (processed_image, label, processed_prev_image, prev_label, + _) = preprocess_utils.flip_dim( + [processed_image, label, processed_prev_image, prev_label], + _PROB_OF_FLIP, + dim=1) + else: + processed_image, label = preprocess_utils.random_crop( + [processed_image, label], crop_height, crop_width) + # Randomly left-right flip the image and label. + processed_image, label, _ = preprocess_utils.flip_dim( + [processed_image, label], _PROB_OF_FLIP, dim=1) + + return resized_image, processed_image, label, processed_prev_image, prev_label + + +def _autoaugment_helper(image, label, ignore_label, policy_name): + image = tf.cast(image, tf.uint8) + label = tf.cast(label, tf.int32) + image, label = autoaugment_utils.distort_image_with_autoaugment( + image, label, ignore_label, policy_name) + image = tf.cast(image, tf.float32) + return image, label diff --git a/data/preprocessing/input_preprocessing_test.py b/data/preprocessing/input_preprocessing_test.py new file mode 100644 index 0000000000000000000000000000000000000000..26a31b87d711e74c48e50c02ff5076ae5917279a --- /dev/null +++ b/data/preprocessing/input_preprocessing_test.py @@ -0,0 +1,174 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for input_preprocessing.""" + +import numpy as np +import tensorflow as tf + +from deeplab2.data.preprocessing import input_preprocessing + + +class InputPreprocessingTest(tf.test.TestCase): + + def setUp(self): + super().setUp() + self._image = tf.convert_to_tensor(np.random.randint(256, size=[33, 33, 3])) + self._label = tf.convert_to_tensor(np.random.randint(19, size=[33, 33, 1])) + + def test_cropping(self): + crop_height = np.random.randint(33) + crop_width = np.random.randint(33) + + original_image, processed_image, processed_label, prev_image, prev_label = ( + input_preprocessing.preprocess_image_and_label( + image=self._image, + label=self._label, + prev_image=tf.identity(self._image), + prev_label=tf.identity(self._label), + crop_height=crop_height, + crop_width=crop_width, + ignore_label=255)) + + self.assertListEqual(original_image.shape.as_list(), + [33, 33, 3]) + self.assertListEqual(processed_image.shape.as_list(), + [crop_height, crop_width, 3]) + self.assertListEqual(processed_label.shape.as_list(), + [crop_height, crop_width, 1]) + np.testing.assert_equal(processed_image.numpy(), prev_image.numpy()) + np.testing.assert_equal(processed_label.numpy(), prev_label.numpy()) + + def test_resizing(self): + height, width = 65, 65 + + original_image, processed_image, processed_label, prev_image, prev_label = ( + input_preprocessing.preprocess_image_and_label( + image=self._image, + label=self._label, + prev_image=tf.identity(self._image), + prev_label=tf.identity(self._label), + crop_height=height, + crop_width=width, + min_resize_value=65, + max_resize_value=65, + resize_factor=32, + ignore_label=255)) + + self.assertListEqual(original_image.shape.as_list(), + [height, width, 3]) + self.assertListEqual(processed_image.shape.as_list(), + [height, width, 3]) + self.assertListEqual(processed_label.shape.as_list(), + [height, width, 1]) + np.testing.assert_equal(processed_image.numpy(), prev_image.numpy()) + np.testing.assert_equal(processed_label.numpy(), prev_label.numpy()) + + def test_scaling(self): + height, width = 65, 65 + + original_image, processed_image, processed_label, prev_image, prev_label = ( + input_preprocessing.preprocess_image_and_label( + image=self._image, + label=self._label, + prev_image=tf.identity(self._image), + prev_label=tf.identity(self._label), + crop_height=height, + crop_width=width, + min_scale_factor=0.5, + max_scale_factor=2.0, + ignore_label=255)) + + self.assertListEqual(original_image.shape.as_list(), + [33, 33, 3]) + self.assertListEqual(processed_image.shape.as_list(), + [height, width, 3]) + self.assertListEqual(processed_label.shape.as_list(), + [height, width, 1]) + np.testing.assert_equal(processed_image.numpy(), prev_image.numpy()) + np.testing.assert_equal(processed_label.numpy(), prev_label.numpy()) + + def test_return_padded_image_and_label(self): + image = np.dstack([[[5, 6], [9, 0]], [[4, 3], [3, 5]], [[7, 8], [1, 2]]]) + image = tf.convert_to_tensor(image, dtype=tf.float32) + label = np.array([[[1], [2]], [[3], [4]]]) + expected_image = np.dstack([[[127.5, 127.5, 127.5, 127.5, 127.5], + [127.5, 127.5, 127.5, 127.5, 127.5], + [127.5, 5, 6, 127.5, 127.5], + [127.5, 9, 0, 127.5, 127.5], + [127.5, 127.5, 127.5, 127.5, 127.5]], + [[127.5, 127.5, 127.5, 127.5, 127.5], + [127.5, 127.5, 127.5, 127.5, 127.5], + [127.5, 4, 3, 127.5, 127.5], + [127.5, 3, 5, 127.5, 127.5], + [127.5, 127.5, 127.5, 127.5, 127.5]], + [[127.5, 127.5, 127.5, 127.5, 127.5], + [127.5, 127.5, 127.5, 127.5, 127.5], + [127.5, 7, 8, 127.5, 127.5], + [127.5, 1, 
2, 127.5, 127.5],
+                                 [127.5, 127.5, 127.5, 127.5, 127.5]]])
+    expected_label = np.array([[[255], [255], [255], [255], [255]],
+                               [[255], [255], [255], [255], [255]],
+                               [[255], [1], [2], [255], [255]],
+                               [[255], [3], [4], [255], [255]],
+                               [[255], [255], [255], [255], [255]]])
+
+    padded_image, padded_label = input_preprocessing._pad_image_and_label(
+        image, label, 2, 1, 5, 5, 255)
+    np.testing.assert_allclose(padded_image.numpy(), expected_image)
+    np.testing.assert_allclose(padded_label.numpy(), expected_label)
+
+  def test_return_original_image_when_target_size_is_equal_to_image_size(self):
+    height, width, _ = tf.shape(self._image)
+    padded_image, _ = input_preprocessing._pad_image_and_label(
+        self._image, None, 0, 0, height, width)
+    np.testing.assert_allclose(padded_image.numpy(), self._image)
+
+  def test_die_on_target_size_greater_than_image_size(self):
+    height, width, _ = tf.shape(self._image)
+    with self.assertRaises(tf.errors.InvalidArgumentError):
+      _ = input_preprocessing._pad_image_and_label(self._image, None, 0, 0,
+                                                   height, width - 1)
+
+    with self.assertRaises(tf.errors.InvalidArgumentError):
+      _ = input_preprocessing._pad_image_and_label(self._image, None, 0, 0,
+                                                   height - 1, width)
+
+  def test_die_if_target_size_not_possible_with_given_offset(self):
+    height, width, _ = tf.shape(self._image)
+    with self.assertRaises(tf.errors.InvalidArgumentError):
+      _ = input_preprocessing._pad_image_and_label(self._image, None, 3, 3,
+                                                   height + 2, width + 2)
+
+  def test_set_min_resize_value_only_during_training(self):
+    crop_height = np.random.randint(33)
+    crop_width = np.random.randint(33)
+
+    _, processed_image, _, _, _ = (
+        input_preprocessing.preprocess_image_and_label(
+            image=self._image,
+            label=self._label,
+            crop_height=crop_height,
+            crop_width=crop_width,
+            min_resize_value=[10],
+            max_resize_value=None,
+            ignore_label=255))
+
+    self.assertListEqual(processed_image.shape.as_list(),
+                         [crop_height, crop_width, 3])
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/data/preprocessing/preprocess_utils.py b/data/preprocessing/preprocess_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3773001558eb4cb1ced3adaf7e73c47d3dc7f0d
--- /dev/null
+++ b/data/preprocessing/preprocess_utils.py
@@ -0,0 +1,516 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utility functions related to preprocessing inputs."""
+
+import numpy as np
+import tensorflow as tf
+
+
+def flip_dim(tensor_list, prob=0.5, dim=1):
+  """Randomly flips a dimension of the given tensor.
+
+  The decision to randomly flip the `Tensors` is made together. In other words,
+  all or none of the images passed in are flipped.
+
+  Note that tf.random_flip_left_right and tf.random_flip_up_down aren't used so
+  that we can control for the probability as well as ensure the same decision
+  is applied across the images.
+
+  Args:
+    tensor_list: A list of `Tensors` with the same number of dimensions.
+    prob: The probability of a left-right flip.
+    dim: The dimension to flip, 0, 1, ...
+
+  Returns:
+    outputs: A list of the possibly flipped `Tensors` as well as an indicator
+      `Tensor` at the end whose value is `True` if the inputs were flipped and
+      `False` otherwise.
+
+  Raises:
+    ValueError: If dim is negative or greater than the dimension of a `Tensor`.
+  """
+  random_value = tf.random.uniform([])
+
+  def flip():
+    flipped = []
+    for tensor in tensor_list:
+      if dim < 0 or dim >= len(tensor.get_shape().as_list()):
+        raise ValueError('dim must represent a valid dimension.')
+      flipped.append(tf.reverse(tensor, [dim]))
+    return flipped
+
+  is_flipped = tf.less_equal(random_value, prob)
+  outputs = tf.cond(is_flipped, flip, lambda: tensor_list)
+  if not isinstance(outputs, (list, tuple)):
+    outputs = [outputs]
+  outputs.append(is_flipped)
+
+  return outputs
+
+
+def get_label_resize_method(label):
+  """Returns the resize method of labels depending on label dtype.
+
+  Args:
+    label: Groundtruth label tensor.
+
+  Returns:
+    tf.image.ResizeMethod.BILINEAR, if label dtype is floating.
+    tf.image.ResizeMethod.NEAREST_NEIGHBOR, if label dtype is integer.
+
+  Raises:
+    ValueError: If label is neither floating nor integer.
+  """
+  if label.dtype.is_floating:
+    return tf.image.ResizeMethod.BILINEAR
+  elif label.dtype.is_integer:
+    return tf.image.ResizeMethod.NEAREST_NEIGHBOR
+  else:
+    raise ValueError('Label type must be either floating or integer.')
+
+
+def _crop(image, offset_height, offset_width, crop_height, crop_width):
+  """Crops the given image using the provided offsets and sizes.
+
+  Note that the method doesn't assume we know the input image size but it does
+  assume we know the input image rank.
+
+  Args:
+    image: an image of shape [height, width, channels].
+    offset_height: a scalar tensor indicating the height offset.
+    offset_width: a scalar tensor indicating the width offset.
+    crop_height: the height of the cropped image.
+    crop_width: the width of the cropped image.
+
+  Returns:
+    The cropped image.
+
+  Raises:
+    ValueError: if `image` doesn't have rank of 3.
+    InvalidArgumentError: if the rank is not 3 or if the image dimensions are
+      less than the crop size.
+  """
+  original_shape = tf.shape(image)
+
+  if len(image.get_shape().as_list()) != 3:
+    raise ValueError('input must have rank of 3')
+  original_channels = image.get_shape().as_list()[2]
+
+  rank_assertion = tf.Assert(
+      tf.equal(tf.rank(image), 3),
+      ['Rank of image must be equal to 3.'])
+  with tf.control_dependencies([rank_assertion]):
+    cropped_shape = tf.stack([crop_height, crop_width, original_shape[2]])
+
+  size_assertion = tf.Assert(
+      tf.logical_and(
+          tf.greater_equal(original_shape[0], crop_height),
+          tf.greater_equal(original_shape[1], crop_width)),
+      ['Crop size greater than the image size.'])
+
+  offsets = tf.cast(tf.stack([offset_height, offset_width, 0]), tf.int32)
+
+  # Use tf.slice instead of crop_to_bounding_box as it accepts tensors to
+  # define the crop size.
+  with tf.control_dependencies([size_assertion]):
+    image = tf.slice(image, offsets, cropped_shape)
+  image = tf.reshape(image, cropped_shape)
+  image.set_shape([crop_height, crop_width, original_channels])
+  return image
+
+
+def random_crop(image_list, crop_height, crop_width):
+  """Crops the given list of images.
+
+  The function applies the same crop to each image in the list. This can be
+  effectively applied when there are multiple image inputs of the same
+  dimension such as:
+
+    image, depths, normals = random_crop([image, depths, normals], 120, 150)
+
+  Args:
+    image_list: a list of image tensors of the same dimension but possibly
+      varying channels.
+    crop_height: the new height.
+    crop_width: the new width.
+
+  Returns:
+    the image_list with cropped images.
+
+  Raises:
+    ValueError: if there are multiple image inputs provided with different
+      sizes or the images are smaller than the crop dimensions.
+  """
+  if not image_list:
+    raise ValueError('Empty image_list.')
+
+  # Compute the rank assertions.
+  rank_assertions = []
+  for i in range(len(image_list)):
+    image_rank = tf.rank(image_list[i])
+    rank_assert = tf.Assert(
+        tf.equal(image_rank, 3), [
+            'Wrong rank for tensor %d in image_list [expected] [actual]', i, 3,
+            image_rank
+        ])
+    rank_assertions.append(rank_assert)
+
+  with tf.control_dependencies([rank_assertions[0]]):
+    image_shape = tf.shape(image_list[0])
+  image_height = image_shape[0]
+  image_width = image_shape[1]
+  crop_size_assert = tf.Assert(
+      tf.logical_and(
+          tf.greater_equal(image_height, crop_height),
+          tf.greater_equal(image_width, crop_width)),
+      ['Crop size greater than the image size.'])
+
+  asserts = [rank_assertions[0], crop_size_assert]
+
+  for i in range(1, len(image_list)):
+    image = image_list[i]
+    asserts.append(rank_assertions[i])
+    with tf.control_dependencies([rank_assertions[i]]):
+      shape = tf.shape(image)
+    height = shape[0]
+    width = shape[1]
+
+    height_assert = tf.Assert(
+        tf.equal(height, image_height), [
+            'Wrong height for tensor %d in image_list [expected][actual]', i,
+            height, image_height
+        ])
+    width_assert = tf.Assert(
+        tf.equal(width, image_width), [
+            'Wrong width for tensor %d in image_list [expected][actual]', i,
+            width, image_width
+        ])
+    asserts.extend([height_assert, width_assert])
+
+  # Create a random bounding box.
+  #
+  # Use tf.random.uniform and not numpy.random.rand as doing the former would
+  # generate random numbers at graph eval time, unlike the latter which
+  # generates random numbers at graph definition time.
+  with tf.control_dependencies(asserts):
+    max_offset_height = tf.reshape(image_height - crop_height + 1, [])
+    max_offset_width = tf.reshape(image_width - crop_width + 1, [])
+  offset_height = tf.random.uniform(
+      [], maxval=max_offset_height, dtype=tf.int32)
+  offset_width = tf.random.uniform(
+      [], maxval=max_offset_width, dtype=tf.int32)
+
+  return [_crop(image, offset_height, offset_width,
+                crop_height, crop_width) for image in image_list]
+
+
+def get_random_scale(min_scale_factor, max_scale_factor, step_size):
+  """Gets a random scale value.
+
+  Args:
+    min_scale_factor: Minimum scale value.
+    max_scale_factor: Maximum scale value.
+    step_size: The step size from minimum to maximum value.
+
+  Returns:
+    A tensor with a random scale value selected between minimum and maximum
+    value. If `min_scale_factor` and `max_scale_factor` are the same, a number
+    is returned instead.
+
+  Raises:
+    ValueError: min_scale_factor has unexpected value.
+  """
+  if min_scale_factor < 0 or min_scale_factor > max_scale_factor:
+    raise ValueError('Unexpected value of min_scale_factor.')
+
+  if min_scale_factor == max_scale_factor:
+    return np.float32(min_scale_factor)
+
+  # When step_size = 0, we sample the value uniformly from [min, max).
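+  # For example (illustrative values): get_random_scale(0.5, 2.0, 0.) may
+  # return any float in [0.5, 2.0), whereas get_random_scale(0.5, 2.0, 0.25)
+  # picks one of the 7 discrete values {0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0}.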
+  if step_size == 0:
+    return tf.random.uniform([1],
+                             minval=min_scale_factor,
+                             maxval=max_scale_factor)
+
+  # When step_size != 0, we randomly select one discrete value from [min, max].
+  num_steps = int((max_scale_factor - min_scale_factor) / step_size + 1)
+  scale_factors = tf.linspace(min_scale_factor, max_scale_factor, num_steps)
+  shuffled_scale_factors = tf.random.shuffle(scale_factors)
+  return shuffled_scale_factors[0]
+
+
+def randomly_scale_image_and_label(image, label=None, scale=1.0):
+  """Randomly scales image and label.
+
+  Args:
+    image: Image with shape [height, width, 3].
+    label: Label with shape [height, width, 1].
+    scale: The value to scale image and label.
+
+  Returns:
+    Scaled image and label.
+  """
+  # No random scaling if scale == 1.
+  if scale == 1.0:
+    return image, label
+  image_shape = tf.shape(image)
+  new_dim = tf.cast(
+      tf.cast([image_shape[0], image_shape[1]], tf.float32) * scale,
+      tf.int32)
+
+  # Need squeeze and expand_dims because image interpolation takes
+  # 4D tensors as input.
+  image = tf.squeeze(
+      tf.compat.v1.image.resize_bilinear(
+          tf.expand_dims(image, 0), new_dim, align_corners=True), [0])
+  if label is not None:
+    label = tf.compat.v1.image.resize(
+        label,
+        new_dim,
+        method=get_label_resize_method(label),
+        align_corners=True)
+
+  return image, label
+
+
+def resolve_shape(tensor, rank=None):
+  """Fully resolves the shape of a Tensor.
+
+  Uses the shape components already known during graph creation as much as
+  possible, and resolves the remaining ones at runtime.
+
+  Args:
+    tensor: Input tensor whose shape we query.
+    rank: The rank of the tensor, provided that we know it.
+
+  Returns:
+    shape: The full shape of the tensor.
+  """
+  if rank is not None:
+    shape = tensor.get_shape().with_rank(rank).as_list()
+  else:
+    shape = tensor.get_shape().as_list()
+
+  if None in shape:
+    dynamic_shape = tf.shape(tensor)
+    for i in range(len(shape)):
+      if shape[i] is None:
+        shape[i] = dynamic_shape[i]
+
+  return shape
+
+
+def _scale_dim(original_size, factor):
+  """Helper method to scale one input dimension by the given factor."""
+  original_size = tf.cast(original_size, tf.float32)
+  factor = tf.cast(factor, tf.float32)
+  return tf.cast(tf.floor(original_size * factor), tf.int32)
+
+
+def process_resize_value(resize_spec):
+  """Helper method to process input resize spec.
+
+  Args:
+    resize_spec: Either None, a python scalar, or a sequence with length <=2.
+      Each value in the sequence should be a python integer.
+
+  Returns:
+    None if input size is not valid, or 2-tuple of (height, width), derived
+    from input resize_spec.
+
+  Raises:
+    ValueError: If the input resize_spec has more than two elements.
+  """
+  if not resize_spec:
+    return None
+
+  if isinstance(resize_spec, int):
+    # For convenience and also backward compatibility.
+    resize_spec = (resize_spec,)
+
+  resize_spec = tuple(resize_spec)
+
+  if len(resize_spec) == 1:
+    resize_spec = (resize_spec[0], resize_spec[0])
+
+  if len(resize_spec) != 2:
+    raise ValueError('Unable to process input resize_spec: %s' % resize_spec)
+
+  if resize_spec[0] <= 0 or resize_spec[1] <= 0:
+    return None
+
+  return resize_spec
+
+
+def _resize_to_match_min_size(input_shape, min_size):
+  """Returns the resized shape so that both sides match minimum size.
+
+  Note: the input image will still be scaled if input height and width
+  are already greater than minimum size.
+
+  Args:
+    input_shape: A 2-tuple, (height, width) of the input image. Each value can
+      be either a python integer or an integer scalar tensor.
+    min_size: A tuple of (minimum height, minimum width) to specify the
+      minimum shape after resize. The input shape would be scaled so that both
+      height and width will be greater than or equal to their minimum value.
+
+  Returns:
+    A 2-tuple, (height, width), resized input shape which preserves input
+      aspect ratio.
+  """
+  input_height, input_width = input_shape
+  min_height, min_width = min_size
+
+  scale_factor = tf.maximum(min_height / input_height, min_width / input_width)
+  return (_scale_dim(input_height, scale_factor),
+          _scale_dim(input_width, scale_factor))
+
+
+def _resize_to_fit_max_size(input_shape, max_size):
+  """Returns the resized shape so that both sides fit within max size.
+
+  Note: if the input shape is already smaller than or equal to the maximum
+  size, no resize operation would be performed.
+
+  Args:
+    input_shape: A 2-tuple, (height, width) of the input image. Each value can
+      be either a python integer or an integer scalar tensor.
+    max_size: A tuple of (maximum height, maximum width) to specify
+      the maximum allowed shape after resize.
+
+  Returns:
+    A 2-tuple, (height, width), resized input shape which preserves input
+      aspect ratio.
+  """
+  input_height, input_width = input_shape
+  max_height, max_width = max_size
+  scale_factor = tf.minimum(max_height / input_height, max_width / input_width)
+
+  scale_factor = tf.minimum(tf.cast(scale_factor, tf.float32),
+                            tf.cast(1.0, tf.float32))
+  return (_scale_dim(input_height, scale_factor),
+          _scale_dim(input_width, scale_factor))
+
+
+def resize_to_range_helper(input_shape, min_size, max_size=None, factor=None):
+  """Determines output size in specified range.
+
+  The output size (height and/or width) can be described by two cases:
+  1. If current side can be rescaled so its minimum size is equal to min_size
+     without the other side exceeding its max_size, then do so.
+  2. Otherwise, resize so at least one side is reaching its max_size.
+
+  An integer in `range(factor)` is added to the computed sides so that the
+  final dimensions are multiples of `factor` plus one.
+
+  Args:
+    input_shape: A 2-tuple, (height, width) of the input image. Each value can
+      be either a python integer or an integer scalar tensor.
+    min_size: A 2-tuple of (height, width), desired minimum value after resize.
+      If a single element is given, then height and width share the same
+      min_size. None, empty or having 0 indicates no minimum value will be
+      used.
+    max_size: A 2-tuple of (height, width), maximum allowed value after resize.
+      If a single element is given, then height and width share the same
+      max_size. None, empty or having 0 indicates no maximum value will be
+      used. Note that the output dimension is no larger than max_size and may
+      be slightly smaller than max_size when factor is not None.
+    factor: None or integer, make output size multiple of factor plus one.
+
+  Returns:
+    A 1-D tensor containing the [new_height, new_width].
+  """
+  output_shape = input_shape
+
+  min_size = process_resize_value(min_size)
+  if min_size:
+    output_shape = _resize_to_match_min_size(input_shape, min_size)
+
+  max_size = process_resize_value(max_size)
+  if max_size:
+    if factor:
+      # Update max_size to be a multiple of factor plus 1 and make sure the
+      # max dimension after resizing is no larger than max_size.
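+      # For example (illustrative values): with factor=8, max_size=(98, 98)
+      # is tightened to (97, 97), since 97 is the largest value <= 98 that is
+      # a multiple of 8 plus one (97 = 8 * 12 + 1).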
+ max_size = (max_size[0] - (max_size[0] - 1) % factor, + max_size[1] - (max_size[1] - 1) % factor) + + output_shape = _resize_to_fit_max_size(output_shape, max_size) + + output_shape = tf.stack(output_shape) + # Ensure that both output sides are multiples of factor plus one. + if factor: + output_shape += (factor - (output_shape - 1) % factor) % factor + + return output_shape + + +def resize_to_range(image, + label=None, + min_size=None, + max_size=None, + factor=None, + align_corners=True, + method=tf.image.ResizeMethod.BILINEAR): + """Resizes image or label so their sides are within the provided range. + + The output size (height and/or width) can be described by two cases: + 1. If current side can be rescaled so its minimum size is equal to min_size + without the other side exceeding its max_size, then do so. + 2. Otherwise, resize so at least one side is reaching its max_size. + + An integer in `range(factor)` is added to the computed sides so that the + final dimensions are multiples of `factor` plus one. + + Args: + image: A 3D tensor of shape [height, width, channels]. + label: (optional) A 3D tensor of shape [height, width, channels]. + min_size: A 2-tuple of (height, width), desired minimum value after resize. + If a single element is given, then height and width share the same + min_size. None, empty or having 0 indicates no minimum value will be used. + max_size: A 2-tuple of (height, width), maximum allowed value after resize. + If a single element is given, then height and width share the same + max_size. None, empty or having 0 indicates no maximum value will be used. + Note that the output dimension is no larger than max_size and may be + slightly smaller than max_size when factor is not None. + factor: Make output size multiple of factor plus one. + align_corners: If True, exactly align all 4 corners of input and output. + method: Image resize method. Defaults to tf.image.ResizeMethod.BILINEAR. + + Returns: + resized_image: A 3-D tensor of shape [new_height, new_width, channels], + where the image has been resized with the specified method. + resized_label: Either None (if input label is None) or a 3-D tensor, + where the input label has been resized accordingly. + + Raises: + ValueError: If the image is not a 3D tensor. + """ + orig_height, orig_width, _ = resolve_shape(image, rank=3) + new_size = resize_to_range_helper(input_shape=(orig_height, orig_width), + min_size=min_size, + max_size=max_size, + factor=factor) + + resized_image = tf.compat.v1.image.resize( + image, new_size, method=method, align_corners=align_corners) + + if label is None: + return resized_image, None + + resized_label = tf.compat.v1.image.resize( + label, + new_size, + method=get_label_resize_method(label), + align_corners=align_corners) + + return resized_image, resized_label diff --git a/data/preprocessing/preprocess_utils_test.py b/data/preprocessing/preprocess_utils_test.py new file mode 100644 index 0000000000000000000000000000000000000000..7bb0aa5a70c550148ad9c9cc4d69e225d4522dbc --- /dev/null +++ b/data/preprocessing/preprocess_utils_test.py @@ -0,0 +1,349 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for preprocess_utils.""" +import numpy as np +import tensorflow as tf + +from deeplab2.data.preprocessing import preprocess_utils + + +class PreprocessUtilsTest(tf.test.TestCase): + + def testNoFlipWhenProbIsZero(self): + numpy_image = np.dstack([[[5., 6.], + [9., 0.]], + [[4., 3.], + [3., 5.]]]) + image = tf.convert_to_tensor(numpy_image) + + actual, is_flipped = preprocess_utils.flip_dim([image], prob=0, dim=0) + self.assertAllEqual(numpy_image, actual) + self.assertFalse(is_flipped) + actual, is_flipped = preprocess_utils.flip_dim([image], prob=0, dim=1) + self.assertAllEqual(numpy_image, actual) + self.assertFalse(is_flipped) + actual, is_flipped = preprocess_utils.flip_dim([image], prob=0, dim=2) + self.assertAllEqual(numpy_image, actual) + self.assertFalse(is_flipped) + + def testFlipWhenProbIsOne(self): + numpy_image = np.dstack([[[5., 6.], + [9., 0.]], + [[4., 3.], + [3., 5.]]]) + dim0_flipped = np.dstack([[[9., 0.], + [5., 6.]], + [[3., 5.], + [4., 3.]]]) + dim1_flipped = np.dstack([[[6., 5.], + [0., 9.]], + [[3., 4.], + [5., 3.]]]) + dim2_flipped = np.dstack([[[4., 3.], + [3., 5.]], + [[5., 6.], + [9., 0.]]]) + image = tf.convert_to_tensor(numpy_image) + + actual, is_flipped = preprocess_utils.flip_dim([image], prob=1, dim=0) + self.assertAllEqual(dim0_flipped, actual) + self.assertTrue(is_flipped) + actual, is_flipped = preprocess_utils.flip_dim([image], prob=1, dim=1) + self.assertAllEqual(dim1_flipped, actual) + self.assertTrue(is_flipped) + actual, is_flipped = preprocess_utils.flip_dim([image], prob=1, dim=2) + self.assertAllEqual(dim2_flipped, actual) + self.assertTrue(is_flipped) + + def testFlipMultipleImagesConsistentlyWhenProbIsOne(self): + numpy_image = np.dstack([[[5., 6.], + [9., 0.]], + [[4., 3.], + [3., 5.]]]) + numpy_label = np.dstack([[[0., 1.], + [2., 3.]]]) + image_dim1_flipped = np.dstack([[[6., 5.], + [0., 9.]], + [[3., 4.], + [5., 3.]]]) + label_dim1_flipped = np.dstack([[[1., 0.], + [3., 2.]]]) + image = tf.convert_to_tensor(numpy_image) + label = tf.convert_to_tensor(numpy_label) + + image, label, is_flipped = preprocess_utils.flip_dim( + [image, label], prob=1, dim=1) + self.assertAllEqual(image_dim1_flipped, image) + self.assertAllEqual(label_dim1_flipped, label) + self.assertTrue(is_flipped) + + def testReturnRandomFlipsOnMultipleEvals(self): + numpy_image = np.dstack([[[5., 6.], + [9., 0.]], + [[4., 3.], + [3., 5.]]]) + dim1_flipped = np.dstack([[[6., 5.], + [0., 9.]], + [[3., 4.], + [5., 3.]]]) + image = tf.convert_to_tensor(numpy_image) + original_image, not_flipped = preprocess_utils.flip_dim( + [image], prob=0, dim=1) + flip_image, is_flipped = preprocess_utils.flip_dim( + [image], prob=1.0, dim=1) + self.assertAllEqual(numpy_image, original_image) + self.assertFalse(not_flipped) + self.assertAllEqual(dim1_flipped, flip_image) + self.assertTrue(is_flipped) + + def testReturnCorrectCropOfSingleImage(self): + np.random.seed(0) + + height, width = 10, 20 + image = np.random.randint(0, 256, size=(height, width, 3)) + + crop_height, crop_width = 2, 4 + + [cropped] = preprocess_utils.random_crop([tf.convert_to_tensor(image)], + 
crop_height,
+                                             crop_width)
+
+    # Ensure we can find the cropped image in the original:
+    is_found = False
+    for x in range(0, width - crop_width + 1):
+      for y in range(0, height - crop_height + 1):
+        if np.isclose(image[y:y+crop_height, x:x+crop_width, :],
+                      cropped).all():
+          is_found = True
+          break
+
+    self.assertTrue(is_found)
+
+  def testRandomCropMaintainsNumberOfChannels(self):
+    np.random.seed(0)
+
+    crop_height, crop_width = 10, 20
+    image = np.random.randint(0, 256, size=(100, 200, 3))
+
+    tf.random.set_seed(37)
+    [cropped] = preprocess_utils.random_crop(
+        [tf.convert_to_tensor(image)], crop_height, crop_width)
+
+    self.assertListEqual(cropped.shape.as_list(), [crop_height, crop_width, 3])
+
+  def testReturnDifferentCropAreasOnTwoEvals(self):
+    tf.random.set_seed(0)
+
+    crop_height, crop_width = 2, 3
+    image = np.random.randint(0, 256, size=(100, 200, 3))
+    [cropped0] = preprocess_utils.random_crop(
+        [tf.convert_to_tensor(image)], crop_height, crop_width)
+    [cropped1] = preprocess_utils.random_crop(
+        [tf.convert_to_tensor(image)], crop_height, crop_width)
+
+    self.assertFalse(np.isclose(cropped0.numpy(), cropped1.numpy()).all())
+
+  def testReturnConsistentCropsOfImagesInTheList(self):
+    tf.random.set_seed(0)
+
+    height, width = 10, 20
+    crop_height, crop_width = 2, 3
+    labels = np.linspace(0, height * width - 1, height * width)
+    labels = labels.reshape((height, width, 1))
+    image = np.tile(labels, (1, 1, 3))
+
+    [cropped_image, cropped_label] = preprocess_utils.random_crop(
+        [tf.convert_to_tensor(image), tf.convert_to_tensor(labels)],
+        crop_height, crop_width)
+
+    for i in range(3):
+      self.assertAllEqual(cropped_image[:, :, i], tf.squeeze(cropped_label))
+
+  def testDieOnRandomCropWhenImagesWithDifferentWidth(self):
+    crop_height, crop_width = 2, 3
+    image1 = tf.convert_to_tensor(np.random.rand(4, 5, 3))
+    image2 = tf.convert_to_tensor(np.random.rand(4, 6, 1))
+
+    with self.assertRaises(tf.errors.InvalidArgumentError):
+      _ = preprocess_utils.random_crop([image1, image2], crop_height,
+                                       crop_width)
+
+  def testDieOnRandomCropWhenImagesWithDifferentHeight(self):
+    crop_height, crop_width = 2, 3
+    image1 = tf.convert_to_tensor(np.random.rand(4, 5, 3))
+    image2 = tf.convert_to_tensor(np.random.rand(5, 5, 1))
+
+    with self.assertRaises(tf.errors.InvalidArgumentError):
+      _ = preprocess_utils.random_crop([image1, image2], crop_height,
+                                       crop_width)
+
+  def testDieOnRandomCropWhenCropSizeIsGreaterThanImage(self):
+    crop_height, crop_width = 5, 9
+    image1 = tf.convert_to_tensor(np.random.rand(4, 5, 3))
+    image2 = tf.convert_to_tensor(np.random.rand(4, 5, 1))
+
+    with self.assertRaises(tf.errors.InvalidArgumentError):
+      _ = preprocess_utils.random_crop([image1, image2], crop_height,
+                                       crop_width)
+
+  def testRandomScaleFitsInRange(self):
+    scale_value = preprocess_utils.get_random_scale(1., 2., 0.)
+    self.assertGreaterEqual(scale_value, 1.)
+    self.assertLessEqual(scale_value, 2.)
+
+  def testDeterminedRandomScaleReturnsNumber(self):
+    scale = preprocess_utils.get_random_scale(1., 1., 0.)
+    self.assertEqual(scale, 1.)
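+
+  def testRandomScaleWithStepSizePicksDiscreteValue(self):
+    # A minimal illustrative sketch: with step_size=0.25, the returned scale
+    # must be one of the 7 evenly spaced values of tf.linspace(0.5, 2.0, 7).
+    scale = preprocess_utils.get_random_scale(0.5, 2., 0.25)
+    self.assertIn(float(scale), [0.5, 0.75, 1., 1.25, 1.5, 1.75, 2.])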
+
+  def testResizeTensorsToRange(self):
+    test_shapes = [[60, 40],
+                   [15, 30],
+                   [15, 50]]
+    min_size = 50
+    max_size = 100
+    factor = None
+    expected_shape_list = [(75, 50, 3),
+                           (50, 100, 3),
+                           (30, 100, 3)]
+    for i, test_shape in enumerate(test_shapes):
+      image = tf.random.normal([test_shape[0], test_shape[1], 3])
+      new_tensor_list = preprocess_utils.resize_to_range(
+          image=image,
+          label=None,
+          min_size=min_size,
+          max_size=max_size,
+          factor=factor,
+          align_corners=True)
+      self.assertEqual(new_tensor_list[0].shape, expected_shape_list[i])
+
+  def testResizeTensorsToRangeWithFactor(self):
+    test_shapes = [[60, 40],
+                   [15, 30],
+                   [15, 50]]
+    min_size = 50
+    max_size = 98
+    factor = 8
+    expected_image_shape_list = [(81, 57, 3),
+                                 (49, 97, 3),
+                                 (33, 97, 3)]
+    expected_label_shape_list = [(81, 57, 1),
+                                 (49, 97, 1),
+                                 (33, 97, 1)]
+    for i, test_shape in enumerate(test_shapes):
+      image = tf.random.normal([test_shape[0], test_shape[1], 3])
+      label = tf.random.normal([test_shape[0], test_shape[1], 1])
+      new_tensor_list = preprocess_utils.resize_to_range(
+          image=image,
+          label=label,
+          min_size=min_size,
+          max_size=max_size,
+          factor=factor,
+          align_corners=True)
+      self.assertEqual(new_tensor_list[0].shape, expected_image_shape_list[i])
+      self.assertEqual(new_tensor_list[1].shape, expected_label_shape_list[i])
+
+  def testResizeTensorsToRangeWithSimilarMinMaxSizes(self):
+    test_shapes = [[60, 40],
+                   [15, 30],
+                   [15, 50]]
+    # Values set so that one of the sides = 97.
+    min_size = 96
+    max_size = 98
+    factor = 8
+    expected_image_shape_list = [(97, 65, 3),
+                                 (49, 97, 3),
+                                 (33, 97, 3)]
+    expected_label_shape_list = [(97, 65, 1),
+                                 (49, 97, 1),
+                                 (33, 97, 1)]
+    for i, test_shape in enumerate(test_shapes):
+      image = tf.random.normal([test_shape[0], test_shape[1], 3])
+      label = tf.random.normal([test_shape[0], test_shape[1], 1])
+      new_tensor_list = preprocess_utils.resize_to_range(
+          image=image,
+          label=label,
+          min_size=min_size,
+          max_size=max_size,
+          factor=factor,
+          align_corners=True)
+      self.assertEqual(new_tensor_list[0].shape, expected_image_shape_list[i])
+      self.assertEqual(new_tensor_list[1].shape, expected_label_shape_list[i])
+
+  def testResizeTensorsToRangeWithEqualMaxSize(self):
+    test_shapes = [[97, 38],
+                   [96, 97]]
+    # Make max_size equal to the larger value of test_shapes.
+    min_size = 97
+    max_size = 97
+    factor = 8
+    expected_image_shape_list = [(97, 41, 3),
+                                 (97, 97, 3)]
+    expected_label_shape_list = [(97, 41, 1),
+                                 (97, 97, 1)]
+    for i, test_shape in enumerate(test_shapes):
+      image = tf.random.normal([test_shape[0], test_shape[1], 3])
+      label = tf.random.normal([test_shape[0], test_shape[1], 1])
+      new_tensor_list = preprocess_utils.resize_to_range(
+          image=image,
+          label=label,
+          min_size=min_size,
+          max_size=max_size,
+          factor=factor,
+          align_corners=True)
+      self.assertEqual(new_tensor_list[0].shape, expected_image_shape_list[i])
+      self.assertEqual(new_tensor_list[1].shape, expected_label_shape_list[i])
+
+  def testResizeTensorsToRangeWithPotentialErrorInTFCeil(self):
+    test_shape = [3936, 5248]
+    # Values chosen so that the rescaled width should land exactly on max_size.
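+    # Presumably (per the test name) the risk is that, in floating point,
+    # 5248 * (1441 / 5248) can land just below 1441 before flooring; the
+    # factor correction in resize_to_range_helper, which adds
+    # (16 - (size - 1) % 16) % 16, still recovers a width of exactly 1441.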
+    min_size = 1441
+    max_size = 1441
+    factor = 16
+    expected_image_shape = (1089, 1441, 3)
+    expected_label_shape = (1089, 1441, 1)
+    image = tf.random.normal([test_shape[0], test_shape[1], 3])
+    label = tf.random.normal([test_shape[0], test_shape[1], 1])
+    new_tensor_list = preprocess_utils.resize_to_range(
+        image=image,
+        label=label,
+        min_size=min_size,
+        max_size=max_size,
+        factor=factor,
+        align_corners=True)
+    self.assertEqual(new_tensor_list[0].shape, expected_image_shape)
+    self.assertEqual(new_tensor_list[1].shape, expected_label_shape)
+
+  def testResizeTensorWithOnlyMaxSize(self):
+    test_shapes = [[97, 38],
+                   [96, 18]]
+
+    max_size = (97, 28)
+    # Since the second test shape already fits max size, do nothing.
+    expected_image_shape_list = [(71, 28, 3),
+                                 (96, 18, 3)]
+    for i, test_shape in enumerate(test_shapes):
+      image = tf.random.normal([test_shape[0], test_shape[1], 3])
+      new_tensor_list = preprocess_utils.resize_to_range(
+          image=image,
+          label=None,
+          min_size=None,
+          max_size=max_size,
+          align_corners=True)
+      self.assertEqual(new_tensor_list[0].shape, expected_image_shape_list[i])
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/data/sample_generator.py b/data/sample_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc08f6f69057c8da060060596b0b06ccac67a4c6
--- /dev/null
+++ b/data/sample_generator.py
@@ -0,0 +1,651 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This file contains code to get a sample from a dataset."""
+
+import functools
+
+import numpy as np
+import tensorflow as tf
+
+from deeplab2 import common
+from deeplab2.data import dataset_utils
+from deeplab2.data.preprocessing import input_preprocessing as preprocessing
+
+
+def _compute_gaussian_from_std(sigma):
+  """Computes the Gaussian and its size from a given standard deviation."""
+  size = int(6 * sigma + 3)
+  x = np.arange(size, dtype=np.float64)
+  y = x[:, np.newaxis]
+  x0, y0 = 3 * sigma + 1, 3 * sigma + 1
+  return np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2)), size
+
+
+class PanopticSampleGenerator:
+  """This class generates samples from images and labels."""
+
+  def __init__(self,
+               dataset_info,
+               is_training,
+               crop_size,
+               min_resize_value=None,
+               max_resize_value=None,
+               resize_factor=None,
+               min_scale_factor=1.,
+               max_scale_factor=1.,
+               scale_factor_step_size=0,
+               autoaugment_policy_name=None,
+               only_semantic_annotations=False,
+               thing_id_mask_annotations=False,
+               max_thing_id=128,
+               sigma=8,
+               focus_small_instances=None):
+    """Initializes the panoptic segmentation generator.
+
+    Args:
+      dataset_info: A dictionary with the following keys.
+        - `name`: String, dataset name.
+        - `ignore_label`: Integer, ignore label.
+        - `class_has_instances_list`: A list of integers indicating which
+          class has instance annotations.
+        - `panoptic_label_divisor`: Integer, panoptic label divisor.
+        - `num_classes`: Integer, number of classes.
+        - `is_video_dataset`: Boolean, is video dataset or not.
+      is_training: Boolean, is training mode or not.
+      crop_size: Image crop size [height, width].
+      min_resize_value: A 2-tuple of (height, width), desired minimum value
+        after resize. If a single element is given, then height and width share
+        the same value. None, empty or having 0 indicates no minimum value will
+        be used.
+      max_resize_value: A 2-tuple of (height, width), maximum allowed value
+        after resize. If a single element is given, then height and width
+        share the same value. None, empty or having 0 indicates no maximum
+        value will be used.
+      resize_factor: Resized dimensions are multiple of factor plus one.
+      min_scale_factor: Minimum scale factor for random scale augmentation.
+      max_scale_factor: Maximum scale factor for random scale augmentation.
+      scale_factor_step_size: The step size from min scale factor to max scale
+        factor. The input is randomly scaled based on the value of
+        (min_scale_factor, max_scale_factor, scale_factor_step_size).
+      autoaugment_policy_name: String, autoaugment policy name. See
+        autoaugment_policy.py for available policies.
+      only_semantic_annotations: An optional flag indicating whether the model
+        needs only semantic annotations (default: False).
+      thing_id_mask_annotations: An optional flag indicating whether the model
+        needs thing_id_mask annotations. When `thing_id_mask_annotations` is
+        True, we will additionally return mask annotation for each `thing`
+        instance, encoded with a unique thing_id. This ground-truth annotation
+        could be used to learn a better segmentation mask for each instance.
+        `thing_id` is a unique ID assigned to each thing instance in an image,
+        counting from 0 (default: False).
+      max_thing_id: The maximum number of possible thing instances per image.
+        It is used together with thing_id_mask_annotations = True, representing
+        the maximum thing ID encoded in the thing_id_mask (default: 128).
+      sigma: The standard deviation of the Gaussian used to encode the center
+        keypoint (default: 8).
+      focus_small_instances: An optional dict that defines how to deal with
+        small instances (default: None):
+        - `threshold`: An integer defining the threshold pixel number for an
+          instance to be considered small.
+        - `weight`: A number that defines the loss weight for small instances.
+ """ + self._dataset_info = dataset_info + self._ignore_label = self._dataset_info['ignore_label'] + self._only_semantic_annotations = only_semantic_annotations + self._sigma = sigma + self._instance_area_threshold = 0 + self._small_instance_weight = 1.0 + self._thing_id_mask_annotations = thing_id_mask_annotations + self._max_thing_id = max_thing_id + self._is_training = is_training + self._preprocessing_fn = functools.partial( + preprocessing.preprocess_image_and_label, + crop_height=crop_size[0], + crop_width=crop_size[1], + min_resize_value=min_resize_value, + max_resize_value=max_resize_value, + resize_factor=resize_factor, + min_scale_factor=min_scale_factor, + max_scale_factor=max_scale_factor, + scale_factor_step_size=scale_factor_step_size, + autoaugment_policy_name=autoaugment_policy_name, + ignore_label=self._ignore_label * + self._dataset_info['panoptic_label_divisor'], + is_training=self._is_training) + + if focus_small_instances is not None: + self._instance_area_threshold = focus_small_instances['threshold'] + self._small_instance_weight = focus_small_instances['weight'] + + self._gaussian, self._gaussian_size = _compute_gaussian_from_std( + self._sigma) + self._gaussian = tf.cast(tf.reshape(self._gaussian, [-1]), tf.float32) + + def __call__(self, sample_dict): + """Gets a sample. + + Args: + sample_dict: A dictionary with the following keys and values: + - `image`: A tensor of shape [image_height, image_width, 3]. + - `image_name`: String, image name. + - `label`: A tensor of shape [label_height, label_width, 1] or None. + - `height`: An integer specifying the height of the image. + - `width`: An integer specifying the width of the image. + - `sequence`: An optional string specifying the sequence name. + - `prev_image`: An optional tensor of the same shape as `image`. + - `prev_label`: An optional tensor of the same shape as `label`. + - `next_image`: An optional next-frame tensor of the shape of `image`. + - `next_label`: An optional next-frame tensor of the shape of `label`. + + Returns: + sample: A dictionary storing required data for panoptic segmentation. + """ + return self.call(**sample_dict) + + def call(self, + image, + image_name, + label, + height, + width, + sequence='', + prev_image=None, + prev_label=None, + next_image=None, + next_label=None): + """Gets a sample. + + Args: + image: A tensor of shape [image_height, image_width, 3]. + image_name: String, image name. + label: A tensor of shape [label_height, label_width, 1] or None. + height: An integer specifying the height of the image. + width: An integer specifying the width of the image. + sequence: An optional string specifying the sequence name. + prev_image: An optional tensor of shape [image_height, image_width, 3]. + prev_label: An optional tensor of shape [label_height, label_width, 1]. + next_image: An optional tensor of shape [image_height, image_width, 3]. + next_label: An optional tensor of shape [label_height, label_width, 1]. + + Returns: + sample: A dictionary storing required data for panoptic segmentation. + + Raises: + ValueError: An error occurs when the label shape is invalid. + NotImplementedError: An error occurs when thing_id_mask_annotations comes + together with prev_image or prev_label, not currently implemented. 
+ """ + if label is not None: + label.get_shape().assert_is_compatible_with( + tf.TensorShape([None, None, 1])) + original_label = tf.cast(label, dtype=tf.int32, name='original_label') + if next_label is not None: + original_next_label = tf.cast( + next_label, dtype=tf.int32, name='original_next_label') + # Reusing the preprocessing function for both next and prev samples. + if next_image is not None: + resized_image, image, label, next_image, next_label = ( + self._preprocessing_fn( + image, label, prev_image=next_image, prev_label=next_label)) + else: + resized_image, image, label, prev_image, prev_label = ( + self._preprocessing_fn( + image, label, prev_image=prev_image, prev_label=prev_label)) + sample = { + common.IMAGE: image + } + if prev_image is not None: + sample[common.IMAGE] = tf.concat([image, prev_image], axis=2) + if next_image is not None: + sample[common.NEXT_IMAGE] = next_image + sample[common.IMAGE] = tf.concat([image, next_image], axis=2) + if label is not None: + # Panoptic label for crowd regions will be ignore_label. + semantic_label, panoptic_label, thing_mask, crowd_region = ( + dataset_utils.get_semantic_and_panoptic_label( + self._dataset_info, label, self._ignore_label)) + sample[common.GT_SEMANTIC_KEY] = tf.squeeze(semantic_label, axis=2) + semantic_weights = tf.ones_like(semantic_label, dtype=tf.float32) + sample[common.SEMANTIC_LOSS_WEIGHT_KEY] = tf.squeeze( + semantic_weights, axis=2) + sample[common.GT_IS_CROWD] = tf.squeeze(crowd_region, axis=2) + + if not self._only_semantic_annotations: + # The sample will have the original label including crowd regions. + sample[common.GT_PANOPTIC_KEY] = tf.squeeze(label, axis=2) + # Compute center loss for all non-crowd and non-ignore pixels. + non_crowd_and_non_ignore_regions = tf.logical_and( + tf.logical_not(crowd_region), + tf.not_equal(semantic_label, self._ignore_label)) + sample[common.CENTER_LOSS_WEIGHT_KEY] = tf.squeeze(tf.cast( + non_crowd_and_non_ignore_regions, tf.float32), axis=2) + # Compute regression loss only for thing pixels that are not crowd. + non_crowd_things = tf.logical_and( + tf.logical_not(crowd_region), thing_mask) + sample[common.REGRESSION_LOSS_WEIGHT_KEY] = tf.squeeze(tf.cast( + non_crowd_things, tf.float32), axis=2) + + prev_panoptic_label = None + next_panoptic_label = None + if prev_label is not None: + _, prev_panoptic_label, _, _ = ( + dataset_utils.get_semantic_and_panoptic_label( + self._dataset_info, prev_label, self._ignore_label)) + if next_label is not None: + _, next_panoptic_label, _, _ = ( + dataset_utils.get_semantic_and_panoptic_label( + self._dataset_info, next_label, self._ignore_label)) + (sample[common.GT_INSTANCE_CENTER_KEY], + sample[common.GT_INSTANCE_REGRESSION_KEY], + sample[common.SEMANTIC_LOSS_WEIGHT_KEY], + prev_center_map, + frame_center_offsets, + next_offset) = self._generate_gt_center_and_offset( + panoptic_label, semantic_weights, prev_panoptic_label, + next_panoptic_label) + + sample[common.GT_INSTANCE_REGRESSION_KEY] = tf.cast( + sample[common.GT_INSTANCE_REGRESSION_KEY], tf.float32) + + if next_label is not None: + sample[common.GT_NEXT_INSTANCE_REGRESSION_KEY] = tf.cast( + next_offset, tf.float32) + sample[common.NEXT_REGRESSION_LOSS_WEIGHT_KEY] = tf.cast( + tf.greater(tf.reduce_sum(tf.abs(next_offset), axis=2), 0), + tf.float32) + + # Only squeeze center map and semantic loss weights, as regression map + # has two channels (x and y offsets). 
+        sample[common.GT_INSTANCE_CENTER_KEY] = tf.squeeze(
+            sample[common.GT_INSTANCE_CENTER_KEY], axis=2)
+        sample[common.SEMANTIC_LOSS_WEIGHT_KEY] = tf.squeeze(
+            sample[common.SEMANTIC_LOSS_WEIGHT_KEY], axis=2)
+
+        if prev_label is not None:
+          sample[common.GT_FRAME_OFFSET_KEY] = frame_center_offsets
+          sample[common.GT_FRAME_OFFSET_KEY] = tf.cast(
+              sample[common.GT_FRAME_OFFSET_KEY], tf.float32)
+          frame_offsets_present = tf.logical_or(
+              tf.not_equal(frame_center_offsets[..., 0], 0),
+              tf.not_equal(frame_center_offsets[..., 1], 0))
+          sample[common.FRAME_REGRESSION_LOSS_WEIGHT_KEY] = tf.cast(
+              frame_offsets_present, tf.float32)
+          if self._is_training:
+            sample[common.IMAGE] = tf.concat(
+                [sample[common.IMAGE], prev_center_map], axis=2)
+
+        if self._thing_id_mask_annotations:
+          if any([prev_image is not None,
+                  prev_label is not None,
+                  next_image is not None,
+                  next_label is not None]):
+            raise NotImplementedError(
+                'Current implementation of Max-DeepLab does not support ' +
+                'prev_image, prev_label, next_image, or next_label.')
+          thing_id_mask, thing_id_class = (
+              self._generate_thing_id_mask_and_class(
+                  panoptic_label, non_crowd_things))
+          sample[common.GT_THING_ID_MASK_KEY] = tf.squeeze(
+              thing_id_mask, axis=2)
+          sample[common.GT_THING_ID_CLASS_KEY] = thing_id_class
+
+    if not self._is_training:
+      # Resized image is only used during visualization.
+      sample[common.RESIZED_IMAGE] = resized_image
+      sample[common.IMAGE_NAME] = image_name
+      sample[common.GT_SIZE_RAW] = tf.stack([height, width], axis=0)
+      if self._dataset_info['is_video_dataset']:
+        sample[common.SEQUENCE_ID] = sequence
+      # Keep original labels for evaluation.
+      if label is not None:
+        orig_semantic_label, _, _, orig_crowd_region = (
+            dataset_utils.get_semantic_and_panoptic_label(
+                self._dataset_info, original_label, self._ignore_label))
+        sample[common.GT_SEMANTIC_RAW] = tf.squeeze(orig_semantic_label, axis=2)
+        if not self._only_semantic_annotations:
+          sample[common.GT_PANOPTIC_RAW] = tf.squeeze(original_label, axis=2)
+          sample[common.GT_IS_CROWD_RAW] = tf.squeeze(orig_crowd_region)
+          if next_label is not None:
+            sample[common.GT_NEXT_PANOPTIC_RAW] = tf.squeeze(
+                original_next_label, axis=2)
+    return sample
+
+  def _generate_thing_id_mask_and_class(self,
+                                        panoptic_label,
+                                        non_crowd_things):
+    """Generates the ground-truth thing-ID masks and their class labels.
+
+    It computes the thing-ID mask and class with a unique ID for each thing
+    instance. `thing_id` is a unique ID assigned to each thing instance in an
+    image, counting from 0. Each pixel in thing_id_mask is labeled with the
+    corresponding thing-ID.
+
+    Args:
+      panoptic_label: A tf.Tensor of shape [height, width, 1].
+      non_crowd_things: A tf.Tensor of shape [height, width, 1], indicating
+        non-crowd and thing-class regions.
+
+    Returns:
+      thing_id_mask: A tf.Tensor of shape [height, width, 1]. It assigns each
+        non-crowd thing instance a unique mask-ID label, starting from 0.
+        Unassigned pixels are set to -1.
+      thing_id_class: A tf.Tensor of shape [max_thing_id]. It contains the
+        semantic ID of each instance assigned to thing_id_mask. The remaining
+        (max_thing_id - num_things) elements are set to -1.
+
+    Raises:
+      ValueError: An error occurs when the thing-ID mask contains stuff or
+        crowd region.
+      ValueError: An error occurs when thing_count is greater than or equal to
+        self._max_thing_id.
+
+    """
+    unique_ids, _ = tf.unique(tf.reshape(panoptic_label, [-1]))
+    thing_id_mask = -tf.ones_like(panoptic_label)
+    thing_id_class = -tf.ones(self._max_thing_id)
+    thing_count = 0
+    for panoptic_id in unique_ids:
+      semantic_id = panoptic_id // self._dataset_info['panoptic_label_divisor']
+      # Filter out IDs that are not thing instances (i.e., IDs for
+      # ignore_label, stuff classes or crowd). Stuff classes and crowd regions
+      # both have IDs of the form panoptic_id = semantic_id * label_divisor
+      # (i.e., instance id = 0).
+      if (semantic_id == self._dataset_info['ignore_label'] or
+          panoptic_id % self._dataset_info['panoptic_label_divisor'] == 0):
+        continue
+
+      assert_stuff_crowd = tf.debugging.Assert(
+          tf.reduce_all(non_crowd_things[panoptic_label == panoptic_id]),
+          ['thing-ID mask here must not contain stuff or crowd region.'])
+      with tf.control_dependencies([assert_stuff_crowd]):
+        panoptic_id = tf.identity(panoptic_id)
+
+      thing_id_mask = tf.where(panoptic_label == panoptic_id,
+                               thing_count, thing_id_mask)
+
+      assert_thing_count = tf.debugging.Assert(
+          thing_count < self._max_thing_id,
+          ['thing_count must be smaller than self._max_thing_id.'])
+      with tf.control_dependencies([assert_thing_count]):
+        thing_count = tf.identity(thing_count)
+
+      thing_id_class = tf.tensor_scatter_nd_update(
+          thing_id_class, [[thing_count]], [semantic_id])
+      thing_count += 1
+    return thing_id_mask, thing_id_class
+
+  def _generate_prev_centers_with_noise(self,
+                                        panoptic_label,
+                                        offset_noise_factor=0.05,
+                                        false_positive_rate=0.2,
+                                        false_positive_noise_factor=0.05):
+    """Generates noisy center predictions for the previous frame.
+
+    Args:
+      panoptic_label: A tf.Tensor of shape [height, width, 1].
+      offset_noise_factor: An optional float defining the maximum fraction of
+        the object size that is used to displace the previous center.
+      false_positive_rate: An optional float indicating at which probability
+        false positives should be added.
+      false_positive_noise_factor: An optional float defining the maximum
+        fraction of the object size that is used to displace the false positive
+        center.
+
+    Returns:
+      A tuple of (center, unique_ids, ids_to_center_x, ids_to_center_y), where
+      center is a tf.Tensor of shape [height, width, 1] holding the noisy
+      center heatmap, unique_ids is a 1-D tf.Tensor with the N unique panoptic
+      IDs, and ids_to_center_x and ids_to_center_y are 1-D tf.Tensors of
+      length N holding the corresponding center coordinates.
+    """
+    height = tf.shape(panoptic_label)[0]
+    width = tf.shape(panoptic_label)[1]
+
+    # Pad center to make boundary handling easier.
+    center_pad_begin = int(round(3 * self._sigma + 1))
+    center_pad_end = int(round(3 * self._sigma + 2))
+    center_pad = center_pad_begin + center_pad_end
+
+    center = tf.zeros((height + center_pad, width + center_pad))
+    unique_ids, _ = tf.unique(tf.reshape(panoptic_label, [-1]))
+    ids_to_center_x = tf.zeros_like(unique_ids, dtype=tf.int32)
+    ids_to_center_y = tf.zeros_like(unique_ids, dtype=tf.int32)
+
+    for panoptic_id in unique_ids:
+      semantic_id = panoptic_id // self._dataset_info['panoptic_label_divisor']
+      # Filter out IDs that should be ignored, are stuff classes or crowd.
+      # Stuff classes and crowd regions both have IDs of the form panoptic_id =
+      # semantic_id * label_divisor.
+      if (semantic_id == self._dataset_info['ignore_label'] or
+          panoptic_id % self._dataset_info['panoptic_label_divisor'] == 0):
+        continue
+
+      # Convert [[y0, x0, 0], ...] to [[y0, ...], [x0, ...], [0, ...]].
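+      # Worked example (hypothetical coordinates): for a two-pixel instance
+      # at (y, x) = (4, 10) and (6, 14), tf.where yields
+      # [[4, 10, 0], [6, 14, 0]], so mask_index becomes
+      # [[4., 6.], [10., 14.], [0., 0.]], centers [5., 12., 0.] and bbox_size
+      # [2., 4., 0.].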
+      mask_index = tf.cast(
+          tf.transpose(tf.where(panoptic_label == panoptic_id)), tf.float32)
+      centers = tf.reduce_mean(mask_index, axis=1)
+      bbox_size = (
+          tf.reduce_max(mask_index, axis=1) - tf.reduce_min(mask_index, axis=1))
+
+      # Add noise.
+      center_y = (
+          centers[0] + tf.random.normal([], dtype=tf.float32) *
+          offset_noise_factor * bbox_size[0])
+      center_x = (
+          centers[1] + tf.random.normal([], dtype=tf.float32) *
+          offset_noise_factor * bbox_size[1])
+
+      center_x = tf.minimum(
+          tf.maximum(tf.cast(tf.round(center_x), tf.int32), 0), width - 1)
+      center_y = tf.minimum(
+          tf.maximum(tf.cast(tf.round(center_y), tf.int32), 0), height - 1)
+
+      id_index = tf.where(tf.equal(panoptic_id, unique_ids))
+      ids_to_center_x = tf.tensor_scatter_nd_update(
+          ids_to_center_x, id_index, tf.expand_dims(center_x, axis=0))
+      ids_to_center_y = tf.tensor_scatter_nd_update(
+          ids_to_center_y, id_index, tf.expand_dims(center_y, axis=0))
+
+      def add_center_gaussian(center_x_coord, center_y_coord, center):
+        # Due to the padding with center_pad_begin in center, the computed
+        # center becomes the upper left corner in the center tensor.
+        upper_left = center_x_coord, center_y_coord
+        bottom_right = (upper_left[0] + self._gaussian_size,
+                        upper_left[1] + self._gaussian_size)
+
+        indices_x, indices_y = tf.meshgrid(
+            tf.range(upper_left[0], bottom_right[0]),
+            tf.range(upper_left[1], bottom_right[1]))
+        indices = tf.transpose(
+            tf.stack([tf.reshape(indices_y, [-1]),
+                      tf.reshape(indices_x, [-1])]))
+
+        return tf.tensor_scatter_nd_max(
+            center, indices, self._gaussian, name='center_scatter')
+
+      center = add_center_gaussian(center_x, center_y, center)
+      # Generate false positives.
+      center_y = (
+          tf.cast(center_y, dtype=tf.float32) +
+          tf.random.normal([], dtype=tf.float32) * false_positive_noise_factor *
+          bbox_size[0])
+      center_x = (
+          tf.cast(center_x, dtype=tf.float32) +
+          tf.random.normal([], dtype=tf.float32) * false_positive_noise_factor *
+          bbox_size[1])
+
+      center_x = tf.minimum(
+          tf.maximum(tf.cast(tf.round(center_x), tf.int32), 0), width - 1)
+      center_y = tf.minimum(
+          tf.maximum(tf.cast(tf.round(center_y), tf.int32), 0), height - 1)
+      # Draw a sample to decide whether to add a false positive or not.
+      center = center + tf.cast(
+          tf.random.uniform([], dtype=tf.float32) < false_positive_rate,
+          tf.float32) * (
+              add_center_gaussian(center_x, center_y, center) - center)
+
+    center = center[center_pad_begin:(center_pad_begin + height),
+                    center_pad_begin:(center_pad_begin + width)]
+    center = tf.expand_dims(center, -1)
+    return center, unique_ids, ids_to_center_x, ids_to_center_y
+
+  def _generate_gt_center_and_offset(self,
+                                     panoptic_label,
+                                     semantic_weights,
+                                     prev_panoptic_label=None,
+                                     next_panoptic_label=None):
+    """Generates the ground-truth center and offset from the panoptic labels.
+
+    Additionally, the per-pixel weights for the semantic branch are increased
+    for small instances. If prev_panoptic_label is passed, it also computes
+    the previous center heatmap with random noise and the offsets between
+    center maps.
+
+    Args:
+      panoptic_label: A tf.Tensor of shape [height, width, 1].
+      semantic_weights: A tf.Tensor of shape [height, width, 1].
+      prev_panoptic_label: An optional tf.Tensor of shape [height, width, 1].
+      next_panoptic_label: An optional tf.Tensor of shape [height, width, 1].
+
+    Returns:
+      A tuple (center, offsets, weights, prev_center, frame_offsets,
+      next_offset). Each entry is a tf.Tensor of shape [height, width, 1],
+      except offsets, frame_offsets, and next_offset, which have shape
+      [height, width, 2].
+      If prev_panoptic_label is None, prev_center and frame_offsets are None.
+      If next_panoptic_label is None, next_offset is None.
+    """
+    height = tf.shape(panoptic_label)[0]
+    width = tf.shape(panoptic_label)[1]
+
+    # Pad center to make boundary handling easier.
+    center_pad_begin = int(round(3 * self._sigma + 1))
+    center_pad_end = int(round(3 * self._sigma + 2))
+    center_pad = center_pad_begin + center_pad_end
+
+    center = tf.zeros((height + center_pad, width + center_pad))
+    offset_x = tf.zeros((height, width, 1), dtype=tf.int32)
+    offset_y = tf.zeros((height, width, 1), dtype=tf.int32)
+    unique_ids, _ = tf.unique(tf.reshape(panoptic_label, [-1]))
+
+    prev_center = None
+    frame_offsets = None
+    # TensorFlow's loop tracing requires these variables to be defined in all
+    # cases, even when prev_panoptic_label or next_panoptic_label is None.
+    frame_offset_x = tf.zeros((height, width, 1), dtype=tf.int32)
+    frame_offset_y = tf.zeros((height, width, 1), dtype=tf.int32)
+
+    # Next-frame instance offsets.
+    next_offset = None
+    next_offset_y = tf.zeros((height, width, 1), dtype=tf.int32)
+    next_offset_x = tf.zeros((height, width, 1), dtype=tf.int32)
+
+    if prev_panoptic_label is not None:
+      (prev_center, prev_unique_ids, prev_centers_x, prev_centers_y
+      ) = self._generate_prev_centers_with_noise(prev_panoptic_label)
+
+    for panoptic_id in unique_ids:
+      semantic_id = panoptic_id // self._dataset_info['panoptic_label_divisor']
+      # Filter out IDs that should be ignored, are stuff classes or crowd.
+      # Stuff classes and crowd regions both have IDs of the form
+      # panoptic_id = semantic_id * label_divisor.
+      if (semantic_id == self._dataset_info['ignore_label'] or
+          panoptic_id % self._dataset_info['panoptic_label_divisor'] == 0):
+        continue
+
+      # Convert [[y0, x0, 0], ...] to [[y0, ...], [x0, ...], [0, ...]].
+      mask_index = tf.transpose(tf.where(panoptic_label == panoptic_id))
+      mask_y_index = mask_index[0]
+      mask_x_index = mask_index[1]
+
+      next_mask_index = None
+      next_mask_y_index = None
+      next_mask_x_index = None
+      if next_panoptic_label is not None:
+        next_mask_index = tf.transpose(
+            tf.where(next_panoptic_label == panoptic_id))
+        next_mask_y_index = next_mask_index[0]
+        next_mask_x_index = next_mask_index[1]
+
+      instance_area = tf.shape(mask_x_index)
+      if instance_area < self._instance_area_threshold:
+        semantic_weights = tf.where(panoptic_label == panoptic_id,
+                                    self._small_instance_weight,
+                                    semantic_weights)
+
+      centers = tf.reduce_mean(tf.cast(mask_index, tf.float32), axis=1)
+
+      center_x = tf.cast(tf.round(centers[1]), tf.int32)
+      center_y = tf.cast(tf.round(centers[0]), tf.int32)
+
+      # Due to the padding with center_pad_begin in center, the computed center
+      # becomes the upper left corner in the center tensor.
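+      # (The crop by center_pad_begin at the end of this method removes the
+      # padding again; assuming the Gaussian window size matches the padding,
+      # its peak then falls on the instance center in the unpadded heatmap.)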
+ upper_left = center_x, center_y + bottom_right = (upper_left[0] + self._gaussian_size, + upper_left[1] + self._gaussian_size) + + indices_x, indices_y = tf.meshgrid( + tf.range(upper_left[0], bottom_right[0]), + tf.range(upper_left[1], bottom_right[1])) + indices = tf.transpose( + tf.stack([tf.reshape(indices_y, [-1]), + tf.reshape(indices_x, [-1])])) + + center = tf.tensor_scatter_nd_max( + center, indices, self._gaussian, name='center_scatter') + offset_y = tf.tensor_scatter_nd_update( + offset_y, + tf.transpose(mask_index), + center_y - tf.cast(mask_y_index, tf.int32), + name='offset_y_scatter') + offset_x = tf.tensor_scatter_nd_update( + offset_x, + tf.transpose(mask_index), + center_x - tf.cast(mask_x_index, tf.int32), + name='offset_x_scatter') + if prev_panoptic_label is not None: + mask = tf.equal(prev_unique_ids, panoptic_id) + if tf.math.count_nonzero(mask) > 0: + prev_center_x = prev_centers_x[mask] + prev_center_y = prev_centers_y[mask] + + frame_offset_y = tf.tensor_scatter_nd_update( + frame_offset_y, + tf.transpose(mask_index), + prev_center_y - tf.cast(mask_y_index, tf.int32), + name='frame_offset_y_scatter') + frame_offset_x = tf.tensor_scatter_nd_update( + frame_offset_x, + tf.transpose(mask_index), + prev_center_x - tf.cast(mask_x_index, tf.int32), + name='frame_offset_x_scatter') + if next_panoptic_label is not None: + next_offset_y = tf.tensor_scatter_nd_update( + next_offset_y, + tf.transpose(next_mask_index), + center_y - tf.cast(next_mask_y_index, tf.int32), + name='next_offset_y_scatter') + next_offset_x = tf.tensor_scatter_nd_update( + next_offset_x, + tf.transpose(next_mask_index), + center_x - tf.cast(next_mask_x_index, tf.int32), + name='next_offset_x_scatter') + + offset = tf.concat([offset_y, offset_x], axis=2) + center = center[center_pad_begin:(center_pad_begin + height), + center_pad_begin:(center_pad_begin + width)] + center = tf.expand_dims(center, -1) + if prev_panoptic_label is not None: + frame_offsets = tf.concat([frame_offset_y, frame_offset_x], axis=2) + if next_panoptic_label is not None: + next_offset = tf.concat([next_offset_y, next_offset_x], axis=2) + return (center, offset, semantic_weights, prev_center, frame_offsets, + next_offset) diff --git a/data/sample_generator_test.py b/data/sample_generator_test.py new file mode 100644 index 0000000000000000000000000000000000000000..8fa3cb3cbd1a3104aca5ad6fa0e909956a914f8b --- /dev/null +++ b/data/sample_generator_test.py @@ -0,0 +1,274 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for sample_generator.""" + +import os + +from absl import flags +import numpy as np +from PIL import Image +import tensorflow as tf + +from deeplab2 import common +from deeplab2.data import data_utils +from deeplab2.data import dataset +from deeplab2.data import sample_generator + +image_utils = tf.keras.preprocessing.image + +flags.DEFINE_string( + 'panoptic_annotation_data', + 'deeplab2/data/testdata/', + 'Path to annotated test image.') +flags.DEFINE_bool('update_golden_data', False, + 'Whether or not to update the golden data for testing.') + +FLAGS = flags.FLAGS + +_FILENAME_PREFIX = 'dummy_000000_000000' +_IMAGE_FOLDER = 'leftImg8bit/' +_TARGET_FOLDER = 'targets/' + + +def _get_groundtruth_image(computed_image_array, groundtruth_image_filename): + if FLAGS.update_golden_data: + image = Image.fromarray(tf.squeeze(computed_image_array).numpy()) + with tf.io.gfile.GFile(groundtruth_image_filename, mode='wb') as fp: + image.save(fp) + return computed_image_array + + with tf.io.gfile.GFile(groundtruth_image_filename, mode='rb') as fp: + image = data_utils.read_image(fp.read()) + # If loaded image has 3 channels, the returned shape is [height, width, 3]. + # If loaded image has 1 channel, the returned shape is [height, width]. + image = np.squeeze(image_utils.img_to_array(image)) + return image + + +def _get_groundtruth_array(computed_image_array, groundtruth_image_filename): + if FLAGS.update_golden_data: + with tf.io.gfile.GFile(groundtruth_image_filename, mode='wb') as fp: + np.save(fp, computed_image_array) + return computed_image_array + with tf.io.gfile.GFile(groundtruth_image_filename, mode='rb') as fp: + # If loaded data has C>1 channels, the returned shape is [height, width, C]. + # If loaded data has 1 channel, the returned shape is [height, width]. 
+ array = np.squeeze(np.load(fp)) + return array + + +class PanopticSampleGeneratorTest(tf.test.TestCase): + + def setUp(self): + super().setUp() + self._test_img_data_dir = os.path.join( + FLAGS.test_srcdir, + FLAGS.panoptic_annotation_data, + _IMAGE_FOLDER) + self._test_gt_data_dir = os.path.join( + FLAGS.test_srcdir, + FLAGS.panoptic_annotation_data) + self._test_target_data_dir = os.path.join( + FLAGS.test_srcdir, + FLAGS.panoptic_annotation_data, + _TARGET_FOLDER) + image_path = self._test_img_data_dir + _FILENAME_PREFIX + '_leftImg8bit.png' + with tf.io.gfile.GFile(image_path, 'rb') as image_file: + rgb_image = data_utils.read_image(image_file.read()) + self._rgb_image = tf.convert_to_tensor(np.array(rgb_image)) + label_path = self._test_gt_data_dir + 'dummy_gt_for_vps.png' + with tf.io.gfile.GFile(label_path, 'rb') as label_file: + label = data_utils.read_image(label_file.read()) + self._label = tf.expand_dims(tf.convert_to_tensor( + np.dot(np.array(label), [1, 256, 256 * 256])), -1) + + def test_input_generator(self): + tf.random.set_seed(0) + np.random.seed(0) + small_instances = {'threshold': 4096, 'weight': 3.0} + generator = sample_generator.PanopticSampleGenerator( + dataset.CITYSCAPES_PANOPTIC_INFORMATION._asdict(), + focus_small_instances=small_instances, + is_training=True, + crop_size=[769, 769], + thing_id_mask_annotations=True) + input_sample = { + 'image': self._rgb_image, + 'image_name': 'test_image', + 'label': self._label, + 'height': 800, + 'width': 800 + } + sample = generator(input_sample) + + self.assertIn(common.IMAGE, sample) + self.assertIn(common.GT_SEMANTIC_KEY, sample) + self.assertIn(common.GT_PANOPTIC_KEY, sample) + self.assertIn(common.GT_INSTANCE_CENTER_KEY, sample) + self.assertIn(common.GT_INSTANCE_REGRESSION_KEY, sample) + self.assertIn(common.GT_IS_CROWD, sample) + self.assertIn(common.GT_THING_ID_MASK_KEY, sample) + self.assertIn(common.GT_THING_ID_CLASS_KEY, sample) + self.assertIn(common.SEMANTIC_LOSS_WEIGHT_KEY, sample) + self.assertIn(common.CENTER_LOSS_WEIGHT_KEY, sample) + self.assertIn(common.REGRESSION_LOSS_WEIGHT_KEY, sample) + + self.assertListEqual(sample[common.IMAGE].shape.as_list(), [769, 769, 3]) + self.assertListEqual(sample[common.GT_SEMANTIC_KEY].shape.as_list(), + [769, 769]) + self.assertListEqual(sample[common.GT_PANOPTIC_KEY].shape.as_list(), + [769, 769]) + self.assertListEqual(sample[common.GT_INSTANCE_CENTER_KEY].shape.as_list(), + [769, 769]) + self.assertListEqual( + sample[common.GT_INSTANCE_REGRESSION_KEY].shape.as_list(), + [769, 769, 2]) + self.assertListEqual(sample[common.GT_IS_CROWD].shape.as_list(), [769, 769]) + self.assertListEqual(sample[common.GT_THING_ID_MASK_KEY].shape.as_list(), + [769, 769]) + self.assertListEqual(sample[common.GT_THING_ID_CLASS_KEY].shape.as_list(), + [128]) + self.assertListEqual( + sample[common.SEMANTIC_LOSS_WEIGHT_KEY].shape.as_list(), [769, 769]) + self.assertListEqual(sample[common.CENTER_LOSS_WEIGHT_KEY].shape.as_list(), + [769, 769]) + self.assertListEqual( + sample[common.REGRESSION_LOSS_WEIGHT_KEY].shape.as_list(), + [769, 769]) + + gt_sem = sample[common.GT_SEMANTIC_KEY] + gt_pan = sample[common.GT_PANOPTIC_KEY] + gt_center = tf.cast(sample[common.GT_INSTANCE_CENTER_KEY] * 255, tf.uint8) + gt_is_crowd = sample[common.GT_IS_CROWD] + gt_thing_id_mask = sample[common.GT_THING_ID_MASK_KEY] + gt_thing_id_class = sample[common.GT_THING_ID_CLASS_KEY] + image = tf.cast(sample[common.IMAGE], tf.uint8) + + # semantic weights can be in range of [0, 3] in this example. 
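+    # The factor 85 maps the maximum semantic weight 3.0 to 255
+    # (3 * 85 = 255), so the weights can be stored and compared as an
+    # 8-bit PNG.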
+ semantic_weights = tf.cast(sample[common.SEMANTIC_LOSS_WEIGHT_KEY] * 85, + tf.uint8) + center_weights = tf.cast(sample[common.CENTER_LOSS_WEIGHT_KEY] * 255, + tf.uint8) + offset_weights = tf.cast(sample[common.REGRESSION_LOSS_WEIGHT_KEY] * 255, + tf.uint8) + + np.testing.assert_almost_equal( + image.numpy(), + _get_groundtruth_image( + image, + self._test_target_data_dir + 'rgb_target.png')) + np.testing.assert_almost_equal( + gt_sem.numpy(), + _get_groundtruth_image( + gt_sem, + self._test_target_data_dir + 'semantic_target.png')) + # Save gt as png. Pillow is currently unable to correctly save the image as + # 32bit, but uses 16bit which overflows. + _ = _get_groundtruth_image( + gt_pan, self._test_target_data_dir + 'panoptic_target.png') + np.testing.assert_almost_equal( + gt_pan.numpy(), + _get_groundtruth_array( + gt_pan, + self._test_target_data_dir + 'panoptic_target.npy')) + np.testing.assert_almost_equal( + gt_thing_id_mask.numpy(), + _get_groundtruth_array( + gt_thing_id_mask, + self._test_target_data_dir + 'thing_id_mask_target.npy')) + np.testing.assert_almost_equal( + gt_thing_id_class.numpy(), + _get_groundtruth_array( + gt_thing_id_class, + self._test_target_data_dir + 'thing_id_class_target.npy')) + np.testing.assert_almost_equal( + gt_center.numpy(), + _get_groundtruth_image( + gt_center, + self._test_target_data_dir + 'center_target.png')) + np.testing.assert_almost_equal( + sample[common.GT_INSTANCE_REGRESSION_KEY].numpy(), + _get_groundtruth_array( + sample[common.GT_INSTANCE_REGRESSION_KEY].numpy(), + self._test_target_data_dir + 'offset_target.npy')) + np.testing.assert_array_equal( + gt_is_crowd.numpy(), + _get_groundtruth_array(gt_is_crowd.numpy(), + self._test_target_data_dir + 'is_crowd.npy')) + np.testing.assert_almost_equal( + semantic_weights.numpy(), + _get_groundtruth_image( + semantic_weights, + self._test_target_data_dir + 'semantic_weights.png')) + np.testing.assert_almost_equal( + center_weights.numpy(), + _get_groundtruth_image( + center_weights, + self._test_target_data_dir + 'center_weights.png')) + np.testing.assert_almost_equal( + offset_weights.numpy(), + _get_groundtruth_image( + offset_weights, + self._test_target_data_dir + 'offset_weights.png')) + + def test_input_generator_eval(self): + tf.random.set_seed(0) + np.random.seed(0) + small_instances = {'threshold': 4096, 'weight': 3.0} + generator = sample_generator.PanopticSampleGenerator( + dataset.CITYSCAPES_PANOPTIC_INFORMATION._asdict(), + focus_small_instances=small_instances, + is_training=False, + crop_size=[800, 800]) + input_sample = { + 'image': self._rgb_image, + 'image_name': 'test_image', + 'label': self._label, + 'height': 800, + 'width': 800 + } + sample = generator(input_sample) + + self.assertIn(common.GT_SEMANTIC_RAW, sample) + self.assertIn(common.GT_PANOPTIC_RAW, sample) + self.assertIn(common.GT_IS_CROWD_RAW, sample) + + gt_sem_raw = sample[common.GT_SEMANTIC_RAW] + gt_pan_raw = sample[common.GT_PANOPTIC_RAW] + gt_is_crowd_raw = sample[common.GT_IS_CROWD_RAW] + + self.assertListEqual(gt_sem_raw.shape.as_list(), [800, 800]) + self.assertListEqual(gt_pan_raw.shape.as_list(), [800, 800]) + self.assertListEqual(gt_is_crowd_raw.shape.as_list(), [800, 800]) + + np.testing.assert_almost_equal( + gt_sem_raw.numpy(), + _get_groundtruth_image( + gt_sem_raw, + self._test_target_data_dir + 'eval_semantic_target.png')) + np.testing.assert_almost_equal( + gt_pan_raw.numpy(), + _get_groundtruth_array( + gt_pan_raw, + self._test_target_data_dir + 'eval_panoptic_target.npy')) + 
np.testing.assert_almost_equal( + gt_is_crowd_raw.numpy(), + _get_groundtruth_array(gt_is_crowd_raw, self._test_target_data_dir + + 'eval_is_crowd.npy')) + + +if __name__ == '__main__': + tf.test.main() diff --git a/data/testdata/create_test_data.py b/data/testdata/create_test_data.py new file mode 100644 index 0000000000000000000000000000000000000000..3e4e06d5b2e3a87943c1cb3f54d490d7588551cb --- /dev/null +++ b/data/testdata/create_test_data.py @@ -0,0 +1,205 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Script to generate test data for cityscapes.""" + +import collections +import json +import os + +from absl import app +from absl import flags +from absl import logging +import numpy as np +from PIL import Image +import tensorflow as tf + +# resources dependency + +from deeplab2.data import data_utils +from deeplab2.data import dataset + +flags.DEFINE_string( + 'panoptic_annotation_path', + 'deeplab2/data/testdata/' + 'dummy_prediction.png', + 'Path to annotated test image with cityscapes encoding.') +flags.DEFINE_string( + 'panoptic_gt_output_path', + 'deeplab2/data/testdata/' + 'dummy_gt_for_vps.png', + 'Path to annotated test image with Video Panoptic Segmentation encoding.') +flags.DEFINE_string( + 'output_cityscapes_root', + 'deeplab2/data/testdata/', + 'Path to output root directory.') + +FLAGS = flags.FLAGS + +# Cityscapes label, using `TrainId`. +_CITYSCAPES_IGNORE = 255 +# Each valid (not ignored) label below is a tuple of (TrainId, EvalId) +_CITYSCAPES_CAR = (13, 26) +_CITYSCAPES_TREE = (8, 21) +_CITYSCAPES_SKY = (10, 23) +_CITYSCAPES_BUILDING = (2, 11) +_CITYSCAPES_ROAD = (0, 7) + +_IS_CROWD = 'is_crowd' +_NOT_CROWD = 'not_crowd' + +_CLASS_HAS_INSTANCES_LIST = dataset.CITYSCAPES_PANOPTIC_INFORMATION.class_has_instances_list +_PANOPTIC_LABEL_DIVISOR = dataset.CITYSCAPES_PANOPTIC_INFORMATION.panoptic_label_divisor +_FILENAME_PREFIX = 'dummy_000000_000000' + + +def create_test_data(annotation_path): + """Creates cityscapes panoptic annotation, vps annotation and segment info. + + Our Video Panoptic Segmentation (VPS) encoding uses ID == semantic trainID * + 1000 + instance ID (starting at 1) with instance ID == 0 marking + crowd regions. + + Args: + annotation_path: The path to the annotation to be loaded. + + Returns: + A tuple of cityscape annotation, vps annotation and segment infos. + """ + # Convert panoptic labels to cityscapes label format. + + # Dictionary mapping converted panoptic annotation to its corresponding + # Cityscapes label. Here the key is encoded by converting each RGB pixel + # value to 1 * R + 256 * G + 256 * 256 * B. 
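+  # For example, the key 31110 below corresponds to the RGB value
+  # (134, 121, 0), since 134 + 256 * 121 = 31110.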
+ panoptic_label_to_cityscapes_label = { + 0: (_CITYSCAPES_IGNORE, _NOT_CROWD), + 31110: (_CITYSCAPES_CAR, _NOT_CROWD), + 31354: (_CITYSCAPES_CAR, _IS_CROWD), + 35173: (_CITYSCAPES_CAR, _NOT_CROWD), + 488314: (_CITYSCAPES_CAR, _IS_CROWD), + 549788: (_CITYSCAPES_CAR, _IS_CROWD), + 1079689: (_CITYSCAPES_CAR, _IS_CROWD), + 1341301: (_CITYSCAPES_CAR, _NOT_CROWD), + 1544590: (_CITYSCAPES_CAR, _NOT_CROWD), + 1926498: (_CITYSCAPES_CAR, _NOT_CROWD), + 4218944: (_CITYSCAPES_TREE, _NOT_CROWD), + 4251840: (_CITYSCAPES_SKY, _NOT_CROWD), + 6959003: (_CITYSCAPES_BUILDING, _NOT_CROWD), + # To be merged with the building segment above. + 8396960: (_CITYSCAPES_BUILDING, _NOT_CROWD), + 8413312: (_CITYSCAPES_ROAD, _NOT_CROWD), + } + with tf.io.gfile.GFile(annotation_path, 'rb') as f: + panoptic = data_utils.read_image(f.read()) + + # Input panoptic annotation is RGB color coded, here we convert each pixel + # to a unique number to avoid comparing 3-tuples. + panoptic = np.dot(panoptic, [1, 256, 256 * 256]) + # Creates cityscapes panoptic map. Cityscapes use ID == semantic EvalId for + # `stuff` segments and `thing` segments with `iscrowd` label, and + # ID == semantic EvalId * 1000 + instance ID (starting from 0) for other + # `thing` segments. + cityscapes_panoptic = np.zeros_like(panoptic, dtype=np.int32) + # Creates Video Panoptic Segmentation (VPS) map. We use ID == semantic + # trainID * 1000 + instance ID (starting at 1) with instance ID == 0 marking + # crowd regions. + vps_panoptic = np.zeros_like(panoptic, dtype=np.int32) + num_instances_per_class = collections.defaultdict(int) + unique_labels = np.unique(panoptic) + + # Dictionary that maps segment id to segment info. + segments_info = {} + for label in unique_labels: + cityscapes_label, is_crowd = panoptic_label_to_cityscapes_label[label] + selected_pixels = panoptic == label + + if cityscapes_label == _CITYSCAPES_IGNORE: + vps_panoptic[selected_pixels] = ( + _CITYSCAPES_IGNORE * _PANOPTIC_LABEL_DIVISOR) + continue + + train_id, eval_id = tuple(cityscapes_label) + cityscapes_id = eval_id + vps_id = train_id * _PANOPTIC_LABEL_DIVISOR + if train_id in _CLASS_HAS_INSTANCES_LIST: + # `thing` class. + if is_crowd != _IS_CROWD: + cityscapes_id = ( + eval_id * _PANOPTIC_LABEL_DIVISOR + + num_instances_per_class[train_id]) + # First instance should have ID 1. + vps_id += num_instances_per_class[train_id] + 1 + num_instances_per_class[train_id] += 1 + + cityscapes_panoptic[selected_pixels] = cityscapes_id + vps_panoptic[selected_pixels] = vps_id + pixel_area = int(np.sum(selected_pixels)) + if cityscapes_id in segments_info: + logging.info('Merging segments with label %d into segment %d', label, + cityscapes_id) + segments_info[cityscapes_id]['area'] += pixel_area + else: + segments_info[cityscapes_id] = { + 'area': pixel_area, + 'category_id': train_id, + 'id': cityscapes_id, + 'iscrowd': 1 if is_crowd == _IS_CROWD else 0, + } + + cityscapes_panoptic = np.dstack([ + cityscapes_panoptic % 256, cityscapes_panoptic // 256, + cityscapes_panoptic // 256 // 256 + ]) + vps_panoptic = np.dstack( + [vps_panoptic % 256, vps_panoptic // 256, vps_panoptic // 256 // 256]) + return (cityscapes_panoptic.astype(np.uint8), vps_panoptic.astype(np.uint8), + list(segments_info.values())) + + +def main(argv): + if len(argv) > 1: + raise app.UsageError('Too many command-line arguments.') + + data_path = FLAGS.panoptic_annotation_path # OSS: removed internal filename loading. 
+ panoptic_map, vps_map, segments_info = create_test_data(data_path) + panoptic_map_filename = _FILENAME_PREFIX + '_gtFine_panoptic.png' + panoptic_map_path = os.path.join(FLAGS.output_cityscapes_root, 'gtFine', + 'cityscapes_panoptic_dummy_trainId', + panoptic_map_filename) + + gt_output_path = FLAGS.panoptic_gt_output_path # OSS: removed internal filename loading. + with tf.io.gfile.GFile(gt_output_path, 'wb') as f: + Image.fromarray(vps_map).save(f, format='png') + + panoptic_map_path = panoptic_map_path # OSS: removed internal filename loading. + with tf.io.gfile.GFile(panoptic_map_path, 'wb') as f: + Image.fromarray(panoptic_map).save(f, format='png') + + json_annotation = { + 'annotations': [{ + 'file_name': _FILENAME_PREFIX + '_gtFine_panoptic.png', + 'image_id': _FILENAME_PREFIX, + 'segments_info': segments_info + }] + } + json_annotation_path = os.path.join(FLAGS.output_cityscapes_root, 'gtFine', + 'cityscapes_panoptic_dummy_trainId.json') + json_annotation_path = json_annotation_path # OSS: removed internal filename loading. + with tf.io.gfile.GFile(json_annotation_path, 'w') as f: + json.dump(json_annotation, f, indent=2) + + +if __name__ == '__main__': + app.run(main) diff --git a/data/testdata/dummy_gt_for_vps.png b/data/testdata/dummy_gt_for_vps.png new file mode 100644 index 0000000000000000000000000000000000000000..e943d1f5704d7d9db8ad0a6c402b6d2eca61ab3a Binary files /dev/null and b/data/testdata/dummy_gt_for_vps.png differ diff --git a/data/testdata/dummy_prediction.png b/data/testdata/dummy_prediction.png new file mode 100644 index 0000000000000000000000000000000000000000..f0b979eb87d103f5b11e548cdbeab9fa11e57d34 Binary files /dev/null and b/data/testdata/dummy_prediction.png differ diff --git a/data/testdata/gtFine/cityscapes_panoptic_dummy_trainId.json b/data/testdata/gtFine/cityscapes_panoptic_dummy_trainId.json new file mode 100644 index 0000000000000000000000000000000000000000..8465f987d8d75a152d5aa85b12514eeb68362448 --- /dev/null +++ b/data/testdata/gtFine/cityscapes_panoptic_dummy_trainId.json @@ -0,0 +1,70 @@ +{ + "annotations": [ + { + "file_name": "dummy_000000_000000_gtFine_panoptic.png", + "image_id": "dummy_000000_000000", + "segments_info": [ + { + "area": 958, + "category_id": 13, + "id": 26000, + "iscrowd": 0 + }, + { + "area": 6178, + "category_id": 13, + "id": 26, + "iscrowd": 1 + }, + { + "area": 10496, + "category_id": 13, + "id": 26001, + "iscrowd": 0 + }, + { + "area": 5534, + "category_id": 13, + "id": 26002, + "iscrowd": 0 + }, + { + "area": 32768, + "category_id": 13, + "id": 26003, + "iscrowd": 0 + }, + { + "area": 19906, + "category_id": 13, + "id": 26004, + "iscrowd": 0 + }, + { + "area": 15940, + "category_id": 8, + "id": 21, + "iscrowd": 0 + }, + { + "area": 278754, + "category_id": 10, + "id": 23, + "iscrowd": 0 + }, + { + "area": 222420, + "category_id": 2, + "id": 11, + "iscrowd": 0 + }, + { + "area": 46475, + "category_id": 0, + "id": 7, + "iscrowd": 0 + } + ] + } + ] +} \ No newline at end of file diff --git a/data/testdata/gtFine/cityscapes_panoptic_dummy_trainId/dummy_000000_000000_gtFine_panoptic.png b/data/testdata/gtFine/cityscapes_panoptic_dummy_trainId/dummy_000000_000000_gtFine_panoptic.png new file mode 100644 index 0000000000000000000000000000000000000000..61fe7ba373f44768e652d9b48386e4299172e755 Binary files /dev/null and b/data/testdata/gtFine/cityscapes_panoptic_dummy_trainId/dummy_000000_000000_gtFine_panoptic.png differ diff --git a/data/testdata/leftImg8bit/dummy_000000_000000_leftImg8bit.png 
b/data/testdata/leftImg8bit/dummy_000000_000000_leftImg8bit.png new file mode 100644 index 0000000000000000000000000000000000000000..a1d4a6eedb8d36bfa265627563107eec8c1cda8c Binary files /dev/null and b/data/testdata/leftImg8bit/dummy_000000_000000_leftImg8bit.png differ diff --git a/data/testdata/targets/center_target.png b/data/testdata/targets/center_target.png new file mode 100644 index 0000000000000000000000000000000000000000..8310b59d0ae5526ee39d54fb63416a436b760038 Binary files /dev/null and b/data/testdata/targets/center_target.png differ diff --git a/data/testdata/targets/center_weights.png b/data/testdata/targets/center_weights.png new file mode 100644 index 0000000000000000000000000000000000000000..2c985f4f1677dce8e2ef26e6c468bd499fff4ae7 Binary files /dev/null and b/data/testdata/targets/center_weights.png differ diff --git a/data/testdata/targets/eval_is_crowd.npy b/data/testdata/targets/eval_is_crowd.npy new file mode 100644 index 0000000000000000000000000000000000000000..b0b544bd0bbe20ae7632b92f6e8ee75e6093eb76 Binary files /dev/null and b/data/testdata/targets/eval_is_crowd.npy differ diff --git a/data/testdata/targets/eval_panoptic_target.npy b/data/testdata/targets/eval_panoptic_target.npy new file mode 100644 index 0000000000000000000000000000000000000000..1dce1427996fb8ad7bac8d3013481548b951284a Binary files /dev/null and b/data/testdata/targets/eval_panoptic_target.npy differ diff --git a/data/testdata/targets/eval_semantic_target.png b/data/testdata/targets/eval_semantic_target.png new file mode 100644 index 0000000000000000000000000000000000000000..60214bbe0e7696852adccf56cb9edef098d2eb40 Binary files /dev/null and b/data/testdata/targets/eval_semantic_target.png differ diff --git a/data/testdata/targets/is_crowd.npy b/data/testdata/targets/is_crowd.npy new file mode 100644 index 0000000000000000000000000000000000000000..24130fc708dccedff42626f8b51908ffc54bc00c Binary files /dev/null and b/data/testdata/targets/is_crowd.npy differ diff --git a/data/testdata/targets/offset_target.npy b/data/testdata/targets/offset_target.npy new file mode 100644 index 0000000000000000000000000000000000000000..c993faf4c15382ff9f2ac2c40165d8fcdeb65f35 Binary files /dev/null and b/data/testdata/targets/offset_target.npy differ diff --git a/data/testdata/targets/offset_weights.png b/data/testdata/targets/offset_weights.png new file mode 100644 index 0000000000000000000000000000000000000000..7918ce04969500719d628e0151321ecda4ff4d8f Binary files /dev/null and b/data/testdata/targets/offset_weights.png differ diff --git a/data/testdata/targets/panoptic_target.npy b/data/testdata/targets/panoptic_target.npy new file mode 100644 index 0000000000000000000000000000000000000000..5e8831e96bf685fb1a83474e8e8810c551f56cbf Binary files /dev/null and b/data/testdata/targets/panoptic_target.npy differ diff --git a/data/testdata/targets/panoptic_target.png b/data/testdata/targets/panoptic_target.png new file mode 100644 index 0000000000000000000000000000000000000000..248d57de058c2c756a9464fdebbae1e6fd7fd630 Binary files /dev/null and b/data/testdata/targets/panoptic_target.png differ diff --git a/data/testdata/targets/rgb_target.png b/data/testdata/targets/rgb_target.png new file mode 100644 index 0000000000000000000000000000000000000000..3da0a683ba406cd16a6b4c3f8fd5e21f4a9d8e11 Binary files /dev/null and b/data/testdata/targets/rgb_target.png differ diff --git a/data/testdata/targets/semantic_target.png b/data/testdata/targets/semantic_target.png new file mode 100644 index 
0000000000000000000000000000000000000000..1100d7764ceb200c413cd8b42a6fdd18692c0371
Binary files /dev/null and b/data/testdata/targets/semantic_target.png differ
diff --git a/data/testdata/targets/semantic_weights.png b/data/testdata/targets/semantic_weights.png
new file mode 100644
index 0000000000000000000000000000000000000000..29b970f31b0a4bb253209225d44cc618532ab261
Binary files /dev/null and b/data/testdata/targets/semantic_weights.png differ
diff --git a/data/testdata/targets/thing_id_class_target.npy b/data/testdata/targets/thing_id_class_target.npy
new file mode 100644
index 0000000000000000000000000000000000000000..6e50e7ebeab996fcd0194798e7efa1cb63a6e062
Binary files /dev/null and b/data/testdata/targets/thing_id_class_target.npy differ
diff --git a/data/testdata/targets/thing_id_mask_target.npy b/data/testdata/targets/thing_id_mask_target.npy
new file mode 100644
index 0000000000000000000000000000000000000000..28c058c8ef020aeb574e649cc8a99afb8c06d867
Binary files /dev/null and b/data/testdata/targets/thing_id_mask_target.npy differ
diff --git a/data/utils/__init__.py b/data/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..35e4ce02ff422f3aa84ab644b88d65b13e0cbc03
--- /dev/null
+++ b/data/utils/__init__.py
@@ -0,0 +1,15 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/data/utils/create_step_panoptic_maps.py b/data/utils/create_step_panoptic_maps.py
new file mode 100644
index 0000000000000000000000000000000000000000..77dd710a6861c858fd4b4ad3dc5f9eba5f912678
--- /dev/null
+++ b/data/utils/create_step_panoptic_maps.py
@@ -0,0 +1,305 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""Creates STEP panoptic maps from semantic and instance maps.
+
+This script implements the process of merging semantic maps (from our extra
+annotations [1]) and instance maps (collected from MOTS [2]) to obtain the
+STEP panoptic map.
+
+[1] Mark Weber, et al. STEP: Segmenting and Tracking Every Pixel,
+    arXiv:2102.11859
+[2] Paul Voigtlaender, et al. Multi-object tracking and segmentation. CVPR,
+    2019
+
+To run this script, you need to install opencv-python (>=4.4.0).
+E.g., in Linux, run
+$ pip install opencv-python
+
+The input directory structure should be as follows:
+
++ INPUT_SEMANTIC_MAP_ROOT_DIR
+  + train
+    + sequence_id
+      - *.png
+    ...
+  + val
+
++ INPUT_INSTANCE_MAP_ROOT_DIR
+  + train
+    + sequence_id
+      - *.png
+    ...
+  + val
+
++ OUTPUT_PANOPTIC_MAP_ROOT_DIR (generated)
+  + train
+    + sequence_id
+      - *.png
+    ...
+  + val
+
+The ground-truth panoptic map is generated and encoded in PNG format as
+follows:
+  R: semantic_id
+  G: instance_id // 256
+  B: instance_id % 256
+
+The generated panoptic maps will be used by ../build_step_data.py to create
+tfrecords for training and evaluation.
+
+Example to run the script:
+
+```bash
+  python deeplab2/data/utils/create_step_panoptic_maps.py \
+    --input_semantic_map_root_dir=...
+    ...
+```
+"""
+
+import os
+from typing import Any, Sequence, Union
+
+from absl import app
+from absl import flags
+from absl import logging
+import cv2
+import numpy as np
+from PIL import Image
+import tensorflow as tf
+
+FLAGS = flags.FLAGS
+flags.DEFINE_string('input_semantic_map_root_dir', None,
+                    'Path to a directory containing the semantic map.')
+flags.DEFINE_string('input_instance_root_dir', None,
+                    'Path to a directory containing the instance map.')
+flags.DEFINE_string('output_panoptic_map_root_dir', None,
+                    'Path to a directory where we write the panoptic map.')
+flags.DEFINE_integer(
+    'kernel_size', 15, 'Kernel size to extend instance object boundary when '
+    'merging it with semantic map.')
+flags.DEFINE_enum('dataset_name', 'kitti-step',
+                  ['kitti-step', 'motchallenge-step'], 'Name of the dataset.')
+
+# The label definition below follows the Cityscapes label definition in
+# https://www.cityscapes-dataset.com/.
+MOTCHALLENGE_MERGED_CLASSES = (0, 3, 4, 5, 6, 7, 9, 13, 14, 15, 16, 17)
+NUM_VALID_CLASSES = 19
+SEMANTIC_CAR = 13
+SEMANTIC_PERSON = 11
+SEMANTIC_VOID = 255
+INSTANCE_CAR = 1
+INSTANCE_PERSON = 2
+INSTANCE_LABEL_DIVISOR = 1000
+
+
+def encode_panoptic_map(panoptic_map: np.ndarray) -> np.ndarray:
+  """Encodes the panoptic map in three-channel image format."""
+  # Encoding format: R: semantic | G: instance // 256 | B: instance % 256.
+  semantic_id = panoptic_map // INSTANCE_LABEL_DIVISOR
+  instance_id = panoptic_map % INSTANCE_LABEL_DIVISOR
+  return np.dstack(
+      (semantic_id, instance_id // 256, instance_id % 256)).astype(np.uint8)
+
+
+def load_image(image_path: str) -> np.ndarray:
+  """Loads an image as a numpy array."""
+  with tf.io.gfile.GFile(image_path, 'rb') as f:
+    return np.array(Image.open(f))
+
+
+def _update_motchallege_label_map(semantic_map: np.ndarray) -> np.ndarray:
+  """Updates the semantic map by merging some classes."""
+  # For the MOTChallenge dataset, we merge some classes since they are less
+  # representative:
+  #----------------------------------------------------------------
+  # Original index | Updated index | Note
+  #----------------+---------------+------------------------------
+  #       0        |       1       | map road to sidewalk
+  #       1        |       1       | keep sidewalk
+  #       2        |       2       | keep building
+  #       3        |      255      | not present anyway
+  #       4        |      255      | remove fence
+  #       5        |      255      | remove pole
+  #       6        |      255      | remove traffic light
+  #       7        |      255      | not present anyway
+  #       8        |       8       | keep vegetation
+  #       9        |       8       | map terrain to vegetation
+  #      10        |      10       | keep sky
+  #      11        |      11       | keep pedestrian
+  #      12        |      12       | keep rider
+  #      13        |      255      | remove car
+  #      14        |      255      | not present anyway
+  #      15        |      255      | not present anyway
+  #      16        |      255      | not present anyway
+  #      17        |      255      | remove motorcycle
+  #      18        |      18       | keep bicycle
+  #      255       |      255      | keep void
+  #----------------------------------------------------------------
+  for label in MOTCHALLENGE_MERGED_CLASSES:
+    if label == 0:
+      semantic_map[semantic_map == label] = 1
+    elif label == 9:
+      semantic_map[semantic_map == label] = 8
+    else:
+      semantic_map[semantic_map == label] = 255
+  return semantic_map
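+
+
+# As a quick illustration of the merging above (hypothetical 1x4 input):
+# _update_motchallege_label_map(np.array([0, 9, 5, 11])) returns
+# [1, 8, 255, 11], i.e., road -> sidewalk, terrain -> vegetation,
+# pole -> void, and pedestrian is kept.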
+
+
+def _compute_panoptic_id(semantic_id: Union[int, np.ndarray],
+                         instance_id: Union[int, np.ndarray]) -> Any:
+  """Gets the panoptic id by combining semantic and instance ids."""
+  return semantic_id * INSTANCE_LABEL_DIVISOR + instance_id
+
+
+def _remap_motchallege_semantic_indices(panoptic_id: np.ndarray) -> np.ndarray:
+  """Updates the MOTChallenge semantic map by re-mapping label indices."""
+  semantic_id = panoptic_id // INSTANCE_LABEL_DIVISOR
+  instance_id = panoptic_id % INSTANCE_LABEL_DIVISOR
+  # Re-mapping index:
+  #    1 ->   0: sidewalk
+  #    2 ->   1: building
+  #    8 ->   2: vegetation
+  #   10 ->   3: sky
+  #   11 ->   4: pedestrian
+  #   12 ->   5: rider
+  #   18 ->   6: bicycle
+  #  255 -> 255: void
+  all_labels = set(range(NUM_VALID_CLASSES))
+  for i, label in enumerate(
+      sorted(all_labels - set(MOTCHALLENGE_MERGED_CLASSES))):
+    semantic_id[semantic_id == label] = i
+  return _compute_panoptic_id(semantic_id, instance_id)
+
+
+def _get_semantic_maps(semantic_map_root: str, dataset_split: str,
+                       sequence_id: str) -> Sequence[str]:
+  """Gets files for the specified data type and dataset split."""
+  search_files = os.path.join(semantic_map_root, dataset_split, sequence_id,
+                              '*')
+  filenames = tf.io.gfile.glob(search_files)
+  return sorted(filenames)
+
+
+class StepPanopticMapGenerator(object):
+  """Generates and writes panoptic maps from semantic and instance maps."""
+
+  def __init__(self, kernel_size: int, dataset_name: str):
+    self.kernel_size = kernel_size
+    self.is_mots_challenge = (dataset_name == 'motchallenge-step')
+
+  def _update_semantic_label_map(self, instance_map: np.ndarray,
+                                 semantic_map: np.ndarray) -> np.ndarray:
+    """Updates the semantic map by leveraging the instance map."""
+    kernel = np.ones((self.kernel_size, self.kernel_size), np.uint8)
+    updated_semantic_map = semantic_map.astype(np.int32)
+    if self.is_mots_challenge:
+      updated_semantic_map = _update_motchallege_label_map(updated_semantic_map)
+    for label in (SEMANTIC_CAR, SEMANTIC_PERSON):
+      semantic_mask = (semantic_map == label)
+      if label == SEMANTIC_PERSON:
+        # The instance ids are encoded according to
+        # https://www.vision.rwth-aachen.de/page/mots
+        instance_mask = (
+            instance_map // INSTANCE_LABEL_DIVISOR == INSTANCE_PERSON)
+      elif label == SEMANTIC_CAR:
+        instance_mask = instance_map // INSTANCE_LABEL_DIVISOR == INSTANCE_CAR
+      # Run dilation on the instance map to merge it with the semantic map.
+      instance_mask = instance_mask.astype(np.uint8)
+      dilated_instance_mask = cv2.dilate(instance_mask, kernel)
+      void_boundary = np.logical_and(dilated_instance_mask - instance_mask,
+                                     semantic_mask)
+      updated_semantic_map[void_boundary] = SEMANTIC_VOID
+    return updated_semantic_map
+
+  def merge_panoptic_map(self, semantic_map: np.ndarray,
+                         instance_map: np.ndarray) -> np.ndarray:
+    """Merges semantic labels with the given instance map."""
+    # Use semantic_map as the base map.
+    updated_semantic_map = self._update_semantic_label_map(
+        instance_map, semantic_map)
+    panoptic_map = _compute_panoptic_id(updated_semantic_map, 0)
+    # Merge instances.
+    mask_car = instance_map // INSTANCE_LABEL_DIVISOR == INSTANCE_CAR
+    # The instance map's raw indices start from 0, but the panoptic map's
+    # instance indices start from 1.
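+    # For example, an instance_map value of 1003 (car, raw instance id 3)
+    # becomes panoptic id 13 * 1000 + 4 = 13004 below.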
+    instance_id = (instance_map[mask_car] % INSTANCE_LABEL_DIVISOR) + 1
+    panoptic_map[mask_car] = _compute_panoptic_id(SEMANTIC_CAR,
+                                                  instance_id.astype(np.int32))
+    mask_person = instance_map // INSTANCE_LABEL_DIVISOR == INSTANCE_PERSON
+    instance_id = (instance_map[mask_person] % INSTANCE_LABEL_DIVISOR) + 1
+    panoptic_map[mask_person] = _compute_panoptic_id(
+        SEMANTIC_PERSON, instance_id.astype(np.int32))
+
+    # Remap label indices.
+    if self.is_mots_challenge:
+      panoptic_map = _remap_motchallege_semantic_indices(panoptic_map)
+    return panoptic_map
+
+  def build_panoptic_maps(self, semantic_map_root: str, instance_map_root: str,
+                          dataset_split: str, sequence_id: str,
+                          panoptic_map_root: str):
+    """Creates panoptic maps and saves them in PNG format.
+
+    Args:
+      semantic_map_root: Semantic map root folder.
+      instance_map_root: Instance map root folder.
+      dataset_split: Train/Val/Test split of the data.
+      sequence_id: Sequence id of the data.
+      panoptic_map_root: Panoptic map root folder where the encoded panoptic
+        maps will be saved.
+    """
+    semantic_maps = _get_semantic_maps(semantic_map_root, dataset_split,
+                                       sequence_id)
+    for semantic_map_path in semantic_maps:
+      image_name = os.path.basename(semantic_map_path)
+      instance_map_path = os.path.join(instance_map_root, dataset_split,
+                                       sequence_id, image_name)
+      if not tf.io.gfile.exists(instance_map_path):
+        logging.warning('Could not find instance map for %s',
+                        semantic_map_path)
+        continue
+      semantic_map = load_image(semantic_map_path)
+      instance_map = load_image(instance_map_path)
+      panoptic_map = self.merge_panoptic_map(semantic_map, instance_map)
+      encoded_panoptic_map = Image.fromarray(
+          encode_panoptic_map(panoptic_map)).convert('RGB')
+      panoptic_map_path = os.path.join(panoptic_map_root, dataset_split,
+                                       sequence_id, image_name)
+      with tf.io.gfile.GFile(panoptic_map_path, 'wb') as f:
+        encoded_panoptic_map.save(f, format='PNG')
+
+
+def main(argv: Sequence[str]) -> None:
+  if len(argv) > 1:
+    raise app.UsageError('Too many command-line arguments.')
+
+  panoptic_map_generator = StepPanopticMapGenerator(FLAGS.kernel_size,
+                                                    FLAGS.dataset_name)
+  for dataset_split in ('train', 'val', 'test'):
+    sem_dir = os.path.join(FLAGS.input_semantic_map_root_dir, dataset_split)
+    if not tf.io.gfile.exists(sem_dir):
+      logging.info('Split %s not found.', dataset_split)
+      continue
+    for set_dir in tf.io.gfile.listdir(sem_dir):
+      tf.io.gfile.makedirs(
+          os.path.join(FLAGS.output_panoptic_map_root_dir, dataset_split,
+                       set_dir))
+      logging.info('Starting to create panoptic maps for split %s, '
+                   'sequence %s.', dataset_split, set_dir)
+      panoptic_map_generator.build_panoptic_maps(
+          FLAGS.input_semantic_map_root_dir, FLAGS.input_instance_root_dir,
+          dataset_split, set_dir, FLAGS.output_panoptic_map_root_dir)
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/dataset.proto b/dataset.proto
new file mode 100644
index 0000000000000000000000000000000000000000..b33263aba4e3c5d06f7699e75c41ed8b52d263c3
--- /dev/null
+++ b/dataset.proto
@@ -0,0 +1,88 @@
+// Copyright 2021 The Deeplab2 Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto2";
+
+package deeplab2;
+
+// Configure the dataset options.
+message DatasetOptions {
+  // Set the dataset. See dataset.py for supported datasets.
+  optional string dataset = 1;
+  // Set the dataset file pattern to be used with glob.
+  repeated string file_pattern = 2;
+  // Set the number of samples per batch. This must be a multiple of replicas.
+  // E.g., batch_size = 8 on 4 GPUs equals a batch size of 2 on each GPU.
+  optional int32 batch_size = 3 [default = 32];
+  // Set the crop size as a list of [crop_height, crop_width].
+  repeated int32 crop_size = 4;
+  // Minimum value for resize. Can be 1) empty; or 2) an integer, indicating
+  // the desired size of the shorter image side (either height or width); or
+  // 3) a 2-tuple of (height, width), indicating the desired minimum value for
+  // height and width after resize. Setting values to non-positive indicates
+  // that no minimum value will be used.
+  repeated int32 min_resize_value = 5;
+  // Maximum value for resize. Can be 1) empty; or 2) an integer, indicating
+  // the maximum allowed size of the longer image side (either height or
+  // width); or 3) a 2-tuple of (height, width), indicating the maximum allowed
+  // size after resize. Setting values to non-positive indicates that no
+  // maximum value will be used.
+  repeated int32 max_resize_value = 6;
+  // Set the resizing factor.
+  optional int32 resize_factor = 7;
+
+  /* Augmentation options. */
+  message AugmentationOptions {
+    // Set the minimum scale factor for augmentation. Default not to use.
+    optional float min_scale_factor = 1 [default = 1.0];
+    // Set the maximum scale factor for augmentation. Default not to use.
+    optional float max_scale_factor = 2 [default = 1.0];
+    // Set the scale factor step size for data augmentation.
+    optional float scale_factor_step_size = 3 [default = 0.25];
+    // The name of the AutoAugment policy to use.
+    optional string autoaugment_policy_name = 4;
+  }
+  optional AugmentationOptions augmentations = 8;
+  // Set the standard deviation used to generate the Gaussian center
+  // ground-truth.
+  optional float sigma = 9 [default = 8.0];
+  // Set whether to use increased weights on small instances.
+  optional bool increase_small_instance_weights = 10 [default = false];
+  // Set the pixel threshold for small instances.
+  optional int32 small_instance_threshold = 11 [default = 4096];
+  // Set the small instance weight.
+  optional float small_instance_weight = 12 [default = 3.0];
+  // Set whether to use two frames together (current frame + previous frame) as
+  // input for video panoptic segmentation.
+  optional bool use_two_frames = 13 [default = false];
+  // Whether to decode the groundtruth label. Some dataset splits (e.g., the
+  // test set) may not contain any groundtruth label. In that case, set this
+  // field to false to avoid decoding a non-existing groundtruth label.
+  optional bool decode_groundtruth_label = 14 [default = true];
+  // Whether the model needs thing_id_mask annotations. When True, we will
+  // additionally return a mask annotation for each `thing` instance, encoded
+  // with a unique thing_id. This ground-truth annotation could be used to
+  // learn a better segmentation mask for each instance. `thing_id` assigns a
+  // unique ID to each thing instance in an image, counting from 0
+  // (default: False).
+  optional bool thing_id_mask_annotations = 15 [default = false];
+  // Set the maximum number of possible thing instances per image. It is used
+  // together with thing_id_mask_annotations (= True), and gives the maximum
+  // thing ID that can be encoded in the thing_id_mask.
+  optional int32 max_thing_id = 16 [default = 128];
+  // Set whether to use the next frame together with the current frame for
+  // video panoptic segmentation (VPS). Like `use_two_frames`, this field
+  // results in two-frame input for VPS. Note that `use_two_frames` is adopted
+  // in Motion-DeepLab, while `use_next_frame` is used in ViP-DeepLab.
+  optional bool use_next_frame = 17 [default = false];
+}
diff --git a/evaluation/__init__.py b/evaluation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..35e4ce02ff422f3aa84ab644b88d65b13e0cbc03
--- /dev/null
+++ b/evaluation/__init__.py
@@ -0,0 +1,15 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/evaluation/coco_instance_ap.py b/evaluation/coco_instance_ap.py
new file mode 100644
index 0000000000000000000000000000000000000000..c97d8c02c2e2c683d4df9f47c5510de8bee7347c
--- /dev/null
+++ b/evaluation/coco_instance_ap.py
@@ -0,0 +1,337 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""COCO-style instance segmentation evaluation metrics.
+
+Implements a Keras interface to the COCO API.
+COCO API: github.com/cocodataset/cocoapi/
+"""
+from typing import Any, Collection, Mapping, Optional
+
+from absl import logging
+import numpy as np
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+import tensorflow as tf
+
+from deeplab2.utils import coco_tools
+from deeplab2.utils import panoptic_instances
+
+
+def _unwrap_segmentation(seg):
+  return {
+      'size': list(seg['size']),
+      'counts': seg['counts'],
+  }
+
+
+_ANNOTATION_CONVERSION = {
+    'bbox': list,
+    'segmentation': _unwrap_segmentation,
+}
+
+
+def _unwrap_annotation(ann: Mapping[str, Any]) -> Mapping[str, Any]:
+  """Unwraps the objects in a COCO-style annotation dictionary.
+
+  Logic within the Keras metric class wraps the objects within the ground-truth
+  and detection annotations in ListWrapper and DictWrapper classes. On the
+  other hand, the COCO API does strict type checking as part of determining
+  which branch to use in comparing detections and segmentations. We therefore
+  have to coerce the types from the wrapper to the built-in types that COCO is
+  expecting.
+
+  Args:
+    ann: A COCO-style annotation dictionary that may contain ListWrapper and
+      DictWrapper objects.
+
+  Returns:
+    The same annotation information, but with wrappers reduced to built-in
+    types.
+  """
+  unwrapped_ann = {}
+  for k in ann:
+    if k in _ANNOTATION_CONVERSION:
+      unwrapped_ann[k] = _ANNOTATION_CONVERSION[k](ann[k])
+    else:
+      unwrapped_ann[k] = ann[k]
+  return unwrapped_ann
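+
+
+# A minimal usage sketch of the metric below (tensor contents hypothetical;
+# see update_state for the expected shapes and dtypes):
+#   metric = InstanceAveragePrecision()
+#   metric.update_state(gt_boxes, gt_classes, gt_masks, gt_is_crowd,
+#                       det_masks, det_scores, det_classes)
+#   ap = metric.result()  # 12-element vector; ap[0] is AP@[IoU=0.50:0.95].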
+
+
+class InstanceAveragePrecision(tf.keras.metrics.Metric):
+  """COCO evaluation metric class."""
+
+  def __init__(self, name: str = 'instance_ap', **kwargs):
+    """Constructs COCO evaluation class."""
+    super(InstanceAveragePrecision, self).__init__(name=name, **kwargs)
+    self.reset_states()
+
+  def reset_states(self) -> None:
+    """Resets the COCO API object."""
+    self.detections = []
+    self.dataset = {
+        'images': [],
+        'annotations': [],
+        'categories': []
+    }
+    self.image_id = 1
+    self.next_groundtruth_annotation_id = 1
+    self.category_ids = set()
+    self.metric_values = None
+
+  def evaluate(self) -> np.ndarray:
+    """Evaluates with detections from all images with the COCO API.
+
+    Returns:
+      coco_metric: float numpy array with shape [12] representing the
+        COCO-style evaluation metrics.
+    """
+    self.dataset['categories'] = [{
+        'id': int(category_id)
+    } for category_id in self.category_ids]
+
+    # Creates "unwrapped" copies of COCO json-style objects.
+    dataset = {
+        'images': self.dataset['images'],
+        'categories': self.dataset['categories']
+    }
+    dataset['annotations'] = [
+        _unwrap_annotation(ann) for ann in self.dataset['annotations']
+    ]
+    detections = [_unwrap_annotation(ann) for ann in self.detections]
+
+    logging.info('Creating COCO objects for AP eval...')
+    coco_gt = COCO()
+    coco_gt.dataset = dataset
+    coco_gt.createIndex()
+
+    coco_dt = coco_gt.loadRes(detections)
+
+    logging.info('Running COCO evaluation...')
+    coco_eval = COCOeval(coco_gt, coco_dt, iouType='segm')
+    coco_eval.evaluate()
+    coco_eval.accumulate()
+    coco_eval.summarize()
+    coco_metrics = coco_eval.stats
+    return np.array(coco_metrics, dtype=np.float32)
+
+  def result(self) -> np.ndarray:
+    """Returns the instance segmentation metric values, computing them if needed.
+
+    Returns:
+      A float vector of 12 elements. The meaning of each element is (in order):
+
+       0. AP @[ IoU=0.50:0.95 | area=   all | maxDets=100 ]
+       1. AP @[ IoU=0.50      | area=   all | maxDets=100 ]
+       2. AP @[ IoU=0.75      | area=   all | maxDets=100 ]
+       3. AP @[ IoU=0.50:0.95 | area= small | maxDets=100 ]
+       4. AP @[ IoU=0.50:0.95 | area=medium | maxDets=100 ]
+       5. AP @[ IoU=0.50:0.95 | area= large | maxDets=100 ]
+       6. AR @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ]
+       7. AR @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ]
+       8. AR @[ IoU=0.50:0.95 | area=   all | maxDets=100 ]
+       9. AR @[ IoU=0.50:0.95 | area= small | maxDets=100 ]
+      10. AR @[ IoU=0.50:0.95 | area=medium | maxDets=100 ]
+      11. AR @[ IoU=0.50:0.95 | area= large | maxDets=100 ]
+
+      Where: AP = Average Precision
+             AR = Average Recall
+             IoU = Intersection over Union. IoU=0.50:0.95 is the average of the
+               metric over thresholds of 0.5 to 0.95 with increments of 0.05.
+
+      The area thresholds mean that, for those entries, ground truth
+      annotations with areas outside the range are ignored:
+        small:  [0**2, 32**2],
+        medium: [32**2, 96**2],
+        large:  [96**2, 1e5**2].
+    """
+    if not self.metric_values:
+      self.metric_values = self.evaluate()
+    return self.metric_values
+
+  def update_state(self, groundtruth_boxes: tf.Tensor,
+                   groundtruth_classes: tf.Tensor, groundtruth_masks: tf.Tensor,
+                   groundtruth_is_crowd: tf.Tensor, detection_masks: tf.Tensor,
+                   detection_scores: tf.Tensor,
+                   detection_classes: tf.Tensor) -> None:
+    """Updates detection results and groundtruth data.
+
+    Appends detection results to self.detections so that results are
+    aggregated over the whole validation set. The groundtruth data is parsed
+    and added into a dictionary with the same format as the COCO dataset,
+    which can be used for evaluation.
+
+    Args:
+      groundtruth_boxes: tensor (float32) with shape [num_gt_annos, 4]
+      groundtruth_classes: tensor (int) with shape [num_gt_annos]
+      groundtruth_masks: tensor (uint8) with shape [num_gt_annos, image_height,
+        image_width]
+      groundtruth_is_crowd: tensor (bool) with shape [num_gt_annos]
+      detection_masks: tensor (uint8) with shape [num_detections, image_height,
+        image_width]
+      detection_scores: tensor (float32) with shape [num_detections]
+      detection_classes: tensor (int) with shape [num_detections]
+    """
+    # Reset the caching of result values.
+    self.metric_values = None
+
+    # Update known category ids.
+    self.category_ids.update(groundtruth_classes.numpy())
+    self.category_ids.update(detection_classes.numpy())
+
+    # Add ground-truth annotations.
+    groundtruth_annotations = coco_tools.ExportSingleImageGroundtruthToCoco(
+        self.image_id,
+        self.next_groundtruth_annotation_id,
+        self.category_ids,
+        groundtruth_boxes.numpy(),
+        groundtruth_classes.numpy(),
+        groundtruth_masks=groundtruth_masks.numpy(),
+        groundtruth_is_crowd=groundtruth_is_crowd.numpy())
+    self.next_groundtruth_annotation_id += len(groundtruth_annotations)
+
+    # Add to the set of images for which there are ground truth & detections.
+    # Infers image size from groundtruth masks.
+    _, height, width = groundtruth_masks.shape
+    self.dataset['images'].append({
+        'id': self.image_id,
+        'height': height,
+        'width': width,
+    })
+    self.dataset['annotations'].extend(groundtruth_annotations)
+
+    # Add predictions/detections.
+    detection_annotations = coco_tools.ExportSingleImageDetectionMasksToCoco(
+        self.image_id, self.category_ids, detection_masks.numpy(),
+        detection_scores.numpy(), detection_classes.numpy())
+    self.detections.extend(detection_annotations)
+
+    self.image_id += 1
+
+
+def _instance_masks(panoptic_label_map: tf.Tensor,
+                    instance_panoptic_labels: tf.Tensor) -> tf.Tensor:
+  """Constructs an array of masks for each instance in a panoptic label map.
+
+  Args:
+    panoptic_label_map: An integer tensor of shape `[image_height, image_width]`
+      specifying the panoptic label at each pixel.
+    instance_panoptic_labels: An integer tensor of shape `[num_instances]` that
+      gives the label for each unique instance for which to compute masks.
+
+  Returns:
+    A boolean tensor of shape `[num_instances, image_height, image_width]` where
+    each slice in the first dimension gives the mask for a single instance over
+    the entire image.
+  """
+  return tf.math.equal(
+      tf.expand_dims(panoptic_label_map, 0),
+      tf.reshape(instance_panoptic_labels,
+                 [tf.size(instance_panoptic_labels), 1, 1]))
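+
+
+# For example (hypothetical 2x2 map), _instance_masks([[5, 5], [7, 0]], [5, 7])
+# broadcasts to shape [2, 2, 2]:
+#   [[[True, True], [False, False]],   # mask for label 5
+#    [[False, False], [True, False]]]  # mask for label 7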
+  This class ignores the "stuff" classes to report metrics on only the "thing"
+  classes that have discrete instances. It computes a series of AP-based
+  metrics using the COCO evaluation scripts.
+  """
+
+  def __init__(self,
+               num_classes: int,
+               things_list: Collection[int],
+               label_divisor: int,
+               ignored_label: int,
+               name: str = 'panoptic_instance_ap',
+               **kwargs):
+    """Constructs panoptic instance segmentation evaluation class."""
+    super(PanopticInstanceAveragePrecision, self).__init__(name=name, **kwargs)
+    self.num_classes = num_classes
+    self.stuff_list = set(range(num_classes)).difference(things_list)
+    self.label_divisor = label_divisor
+    self.ignored_label = ignored_label
+    self.detection_metric = InstanceAveragePrecision()
+    self.reset_states()
+
+  def reset_states(self) -> None:
+    self.detection_metric.reset_states()
+
+  def result(self) -> np.ndarray:
+    return self.detection_metric.result()
+
+  def update_state(self,
+                   groundtruth_panoptic: tf.Tensor,
+                   predicted_panoptic: tf.Tensor,
+                   semantic_probability: tf.Tensor,
+                   instance_score_map: tf.Tensor,
+                   is_crowd_map: Optional[tf.Tensor] = None) -> None:
+    """Adds the results from a new image to be computed by the metric.
+
+    Args:
+      groundtruth_panoptic: A 2D integer tensor, with the true panoptic label
+        at each pixel.
+      predicted_panoptic: A 2D integer tensor with the predicted panoptic
+        labels to be evaluated.
+      semantic_probability: A float tensor of shape `[image_height,
+        image_width, num_classes]`. Specifies at each pixel the estimated
+        probability distribution over semantic classes for that pixel.
+      instance_score_map: A 2D float tensor, where each pixel of an instance
+        carries the predicted probability that this segment is a valid
+        instance.
+      is_crowd_map: A 2D boolean tensor. Where it is True, the instance in
+        that region is a "crowd" instance. It is assumed that all pixels in an
+        instance will have the same value in this map. If set to None (the
+        default), it will be assumed that none of the ground truth instances
+        are crowds.
+    """
+    classes_to_ignore = tf.convert_to_tensor([self.ignored_label] +
+                                             list(self.stuff_list), tf.int32)
+    (gt_unique_labels,
+     gt_box_coords) = panoptic_instances.instance_boxes_from_masks(
+         groundtruth_panoptic, classes_to_ignore, self.label_divisor)
+    gt_classes = tf.math.floordiv(gt_unique_labels, self.label_divisor)
+
+    gt_masks = _instance_masks(groundtruth_panoptic, gt_unique_labels)
+
+    if is_crowd_map is None:
+      gt_is_crowd = tf.zeros(tf.shape(gt_classes), tf.bool)
+    else:
+      gt_is_crowd = panoptic_instances.per_instance_is_crowd(
+          is_crowd_map, groundtruth_panoptic, gt_unique_labels)
+
+    (pred_unique_labels,
+     pred_scores) = panoptic_instances.combined_instance_scores(
+         predicted_panoptic, semantic_probability, instance_score_map,
+         self.label_divisor, self.ignored_label)
+
+    # Filter out stuff and ignored label.
+    pred_classes = tf.math.floordiv(pred_unique_labels, self.label_divisor)
+    pred_class_is_ignored = tf.math.reduce_any(
+        tf.math.equal(
+            tf.expand_dims(pred_classes, 1),
+            tf.expand_dims(classes_to_ignore, 0)),
+        axis=1)
+    pred_class_is_kept = tf.math.logical_not(pred_class_is_ignored)
+    pred_unique_labels = tf.boolean_mask(pred_unique_labels,
+                                         pred_class_is_kept)
+    pred_scores = tf.boolean_mask(pred_scores, pred_class_is_kept)
+
+    # Recompute class labels after the filtering.
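+    # Note: pred_classes must be re-derived from the filtered labels; the
+    # pre-filter pred_classes tensor no longer lines up element-wise with
+    # pred_unique_labels after the boolean_mask above.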
+ pred_classes = tf.math.floordiv(pred_unique_labels, self.label_divisor) + pred_masks = _instance_masks(predicted_panoptic, pred_unique_labels) + + self.detection_metric.update_state(gt_box_coords, gt_classes, gt_masks, + gt_is_crowd, pred_masks, pred_scores, + pred_classes) diff --git a/evaluation/coco_instance_ap_test.py b/evaluation/coco_instance_ap_test.py new file mode 100644 index 0000000000000000000000000000000000000000..efc11d829e46e8888bde650ab44025c7ac98fda3 --- /dev/null +++ b/evaluation/coco_instance_ap_test.py @@ -0,0 +1,316 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for the COCO Instance AP metric.""" + +from absl import logging +import numpy as np +import tensorflow as tf + +from deeplab2.evaluation import coco_instance_ap +from deeplab2.evaluation import test_utils + +# See the definition of the color names at: +# https://en.wikipedia.org/wiki/Web_colors. +_CLASS_COLOR_MAP = { + (0, 0, 0): 0, + (0, 0, 255): 1, # Person (blue). + (255, 0, 0): 2, # Bear (red). + (0, 255, 0): 3, # Tree (lime). + (255, 0, 255): 4, # Bird (fuchsia). + (0, 255, 255): 5, # Sky (aqua). + (255, 255, 0): 6, # Cat (yellow). +} + + +def combine_maps(semantic_map, instance_map, label_divisor): + combined_map = instance_map + semantic_map * label_divisor + return tf.cast(combined_map, tf.int32) + + +class CocoInstanceApTest(tf.test.TestCase): + + def test_evaluates_single_image(self): + groundtruth_boxes = [ + [0.25, 0.4, 0.75, 1.0], + ] + groundtruth_classes = [8] + groundtruth_masks = [[ + [0, 0, 0, 0, 0], + [0, 0, 1, 1, 0], + [0, 0, 1, 1, 1], + [0, 0, 0, 0, 0], + ]] + groundtruth_is_crowd = [False] + + detection_masks = [[ + [0, 0, 0, 0, 0], + [0, 0, 1, 1, 0], + [0, 0, 1, 1, 0], + [0, 0, 0, 0, 0], + ]] + detection_scores = [0.8] + detection_classes = [8] + + groundtruth_boxes = tf.constant(groundtruth_boxes, dtype=tf.float32) + groundtruth_classes = tf.constant(groundtruth_classes, dtype=tf.int32) + groundtruth_masks = tf.constant(groundtruth_masks, dtype=tf.uint8) + groundtruth_is_crowd = tf.constant(groundtruth_is_crowd, dtype=tf.bool) + + detection_masks = tf.constant(detection_masks, dtype=tf.uint8) + detection_scores = tf.constant(detection_scores, dtype=tf.float32) + detection_classes = tf.constant(detection_classes, dtype=tf.int32) + + metric_obj = coco_instance_ap.InstanceAveragePrecision() + metric_obj.update_state(groundtruth_boxes, groundtruth_classes, + groundtruth_masks, groundtruth_is_crowd, + detection_masks, detection_scores, + detection_classes) + result = metric_obj.result().numpy() + + # The IoU for the foreground match is 0.8. So it is a TP for 7/10 of the IoU + # thresholds. 
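+    # The ten thresholds are 0.50, 0.55, ..., 0.95, so AP/AR over the sweep
+    # is 7/10 = 0.7, while AP@0.50 and AP@0.75 are 1. Entries for area
+    # ranges without groundtruth report -1.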
+ expected_result = [0.7, 1, 1, 0.7, -1, -1, 0.7, 0.7, 0.7, 0.7, -1, -1] + np.testing.assert_array_almost_equal(result, expected_result) + + +class PanopticInstanceApTest(tf.test.TestCase): + + def test_evaluates_single_image(self): + num_classes = 3 + things_list = [1, 2] + label_divisor = 256 + ignore_label = 0 + instance_class_map = { + 0: 0, + 47: 1, + 97: 1, + 133: 1, + 150: 1, + 174: 1, + 198: 2, + 215: 1, + 244: 1, + 255: 1, + } + gt_instances, gt_classes = test_utils.panoptic_segmentation_with_class_map( + 'team_gt_instance.png', instance_class_map) + gt_panoptic = combine_maps(gt_classes, gt_instances, label_divisor) + + pred_classes = test_utils.read_segmentation_with_rgb_color_map( + 'team_pred_class.png', _CLASS_COLOR_MAP) + pred_instances = test_utils.read_test_image( + 'team_pred_instance.png', image_format='L') + + pred_panoptic = combine_maps(pred_classes, pred_instances, label_divisor) + semantic_probability = tf.ones( + tf.concat([tf.shape(pred_panoptic), [num_classes]], 0)) + instance_score_map = tf.ones(tf.shape(pred_panoptic)) + + metric_obj = coco_instance_ap.PanopticInstanceAveragePrecision( + num_classes, things_list, label_divisor, ignore_label) + metric_obj.update_state(gt_panoptic, pred_panoptic, semantic_probability, + instance_score_map) + + result = metric_obj.result().numpy() + logging.info('result = %s', result) + + expected_result = [ + 0.2549, 0.9356, 0.1215, -1.0, 0.2399, 0.501, 0.0812, 0.2688, 0.2688, + -1.0, 0.2583, 0.5 + ] + np.testing.assert_almost_equal(result, expected_result, decimal=4) + + def test_evaluates_with_scores(self): + num_classes = 3 + things_list = list(range(num_classes)) + label_divisor = 256 + ignore_label = 0 + gt_classes = tf.constant([ + [1, 1, 2, 2], + [1, 1, 2, 2], + [0, 0, 2, 2], + [0, 0, 2, 2], + ], tf.int32) + pred_classes = tf.constant([ + [1, 1, 1, 1], + [1, 1, 1, 1], + [0, 0, 2, 2], + [0, 0, 2, 2], + ], tf.int32) + instances = tf.constant([ + [1, 1, 2, 2], + [1, 1, 2, 2], + [0, 0, 3, 3], + [0, 0, 3, 3], + ], tf.int32) + + gt_panoptic = combine_maps(gt_classes, instances, label_divisor) + pred_panoptic = combine_maps(pred_classes, instances, label_divisor) + + semantic_probability = tf.constant([ + [ + [0, 0, 0, 0], + [0, 0, 0, 0], + [1, 1, 0, 0], + [1, 1, 0, 0], + ], + [ + [1, 1, 1, 1], + [1, 1, 1, 1], + [0, 0, 0, 0], + [0, 0, 0, 0], + ], + [ + [0, 0, 0, 0], + [0, 0, 0, 0], + [0, 0, 1, 1], + [0, 0, 1, 1], + ], + ], tf.float32) + semantic_probability = tf.transpose(semantic_probability, [1, 2, 0]) + + # This score map gives higher score to the incorrect instance. + bad_instance_scores = tf.constant([ + [0.4, 0.4, 0.9, 0.9], + [0.4, 0.4, 0.9, 0.9], + [0.0, 0.0, 0.8, 0.8], + [0.0, 0.0, 0.8, 0.8], + ], tf.float32) + metric_obj = coco_instance_ap.PanopticInstanceAveragePrecision( + num_classes, things_list, label_divisor, ignore_label) + metric_obj.update_state(gt_panoptic, pred_panoptic, semantic_probability, + bad_instance_scores) + + bad_result = metric_obj.result().numpy() + logging.info('bad_result = %s', bad_result) + expected_bad_result = [ + 0.5025, 0.5025, 0.5025, 0.5025, -1., -1., 0.25, 0.75, 0.75, 0.75, -1., + -1. + ] + np.testing.assert_almost_equal(bad_result, expected_bad_result, decimal=4) + + # This score map gives lower score to the incorrect instance. 
+ good_instance_scores = tf.constant([ + [0.9, 0.9, 0.4, 0.4], + [0.9, 0.9, 0.4, 0.4], + [0.0, 0.0, 0.8, 0.8], + [0.0, 0.0, 0.8, 0.8], + ], tf.float32) + metric_obj.reset_states() + metric_obj.update_state(gt_panoptic, pred_panoptic, semantic_probability, + good_instance_scores) + + good_result = metric_obj.result().numpy() + logging.info('good_result = %s', good_result) + + # Since the correct instance(s) have higher score, the "good" scores should + # give a result with higher AP. + expected_good_result = [ + 0.75248, 0.75248, 0.75248, 0.75248, -1, -1, 0.75, 0.75, 0.75, 0.75, -1, + -1 + ] + np.testing.assert_almost_equal(good_result, expected_good_result, decimal=4) + + def test_ignores_crowds(self): + num_classes = 3 + things_list = list(range(num_classes)) + label_divisor = 256 + ignore_label = 0 + gt_classes = tf.constant([ + [1, 1, 2, 2], + [1, 1, 2, 2], + [0, 0, 2, 2], + [0, 0, 2, 2], + ], tf.int32) + pred_classes = tf.constant([ + [1, 1, 1, 1], + [1, 1, 1, 1], + [0, 0, 2, 2], + [0, 0, 2, 2], + ], tf.int32) + instances = tf.constant([ + [1, 1, 2, 2], + [1, 1, 2, 2], + [0, 0, 3, 3], + [0, 0, 3, 3], + ], tf.int32) + is_crowd_map = tf.math.equal(instances, 2) + + gt_panoptic = combine_maps(gt_classes, instances, label_divisor) + pred_panoptic = combine_maps(pred_classes, instances, label_divisor) + + semantic_probability = tf.ones( + tf.concat([tf.shape(pred_panoptic), [num_classes]], 0)) + instance_score_map = tf.ones(tf.shape(pred_panoptic)) + + metric_obj = coco_instance_ap.PanopticInstanceAveragePrecision( + num_classes, things_list, label_divisor, ignore_label) + metric_obj.update_state(gt_panoptic, pred_panoptic, semantic_probability, + instance_score_map, is_crowd_map) + + result = metric_obj.result().numpy() + logging.info('result = %s', result) + # Expect perfect results (for the quantities that have an AP value), because + # the only mistake is a "crowd" instance. + expected_result = [1., 1., 1., 1., -1., -1., 1., 1., 1., 1., -1., -1.] + np.testing.assert_almost_equal(result, expected_result, decimal=4) + + def test_ignores_stuff(self): + num_classes = 4 + things_list = [3] + label_divisor = 256 + ignore_label = 0 + gt_classes = tf.constant([ + [3, 3, 2, 2], + [3, 3, 2, 2], + [0, 0, 2, 2], + [0, 0, 2, 2], + ], tf.int32) + pred_classes = tf.constant([ + [3, 3, 1, 1], + [3, 3, 1, 1], + [0, 0, 2, 2], + [0, 0, 2, 2], + ], tf.int32) + instances = tf.constant([ + [1, 1, 2, 2], + [1, 1, 2, 2], + [0, 0, 3, 3], + [0, 0, 3, 3], + ], tf.int32) + + gt_panoptic = combine_maps(gt_classes, instances, label_divisor) + pred_panoptic = combine_maps(pred_classes, instances, label_divisor) + + semantic_probability = tf.ones( + tf.concat([tf.shape(pred_panoptic), [num_classes]], 0)) + instance_score_map = tf.ones(tf.shape(pred_panoptic)) + + metric_obj = coco_instance_ap.PanopticInstanceAveragePrecision( + num_classes, things_list, label_divisor, ignore_label) + metric_obj.update_state(gt_panoptic, pred_panoptic, semantic_probability, + instance_score_map) + + result = metric_obj.result().numpy() + logging.info('result = %s', result) + # Expect perfect results (for the quantities that have an AP value), because + # the mistakes are all in "stuff" classes. + expected_result = [1., 1., 1., 1., -1., -1., 1., 1., 1., 1., -1., -1.] 
+    np.testing.assert_almost_equal(result, expected_result, decimal=4)
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/evaluation/depth_aware_segmentation_and_tracking_quality.py b/evaluation/depth_aware_segmentation_and_tracking_quality.py
new file mode 100644
index 0000000000000000000000000000000000000000..8bbbb637a48bc480d83dfb8cc0f70998a5729c64
--- /dev/null
+++ b/evaluation/depth_aware_segmentation_and_tracking_quality.py
@@ -0,0 +1,210 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

+"""Implementation of Depth-aware Segmentation and Tracking Quality (DSTQ) metric."""
+
+import collections
+from typing import Sequence, List, Tuple
+import tensorflow as tf
+from deeplab2.evaluation import segmentation_and_tracking_quality as stq
+
+
+class DSTQuality(stq.STQuality):
+  """Metric class for Depth-aware Segmentation and Tracking Quality (DSTQ).
+
+  This metric computes STQ and the inlier depth metric (or depth quality (DQ))
+  under several thresholds. It then returns the geometric mean of the DQ
+  scores, AQ and IoU to get the final DSTQ, i.e.,
+
+  DSTQ@{threshold_1} = pow(STQ ** 2 * DQ@{threshold_1}, 1/3)
+  DSTQ@{threshold_2} = pow(STQ ** 2 * DQ@{threshold_2}, 1/3)
+  ...
+  DSTQ = pow(STQ ** 2 * DQ, 1/3)
+
+  where DQ = pow(prod_i^n(DQ@{threshold_i}), 1/n) for n depth thresholds.
+
+  The default choices for depth thresholds are 1.1 and 1.25, i.e.,
+  max(pred/gt, gt/pred) <= 1.1 and max(pred/gt, gt/pred) <= 1.25.
+  Commonly used thresholds for the inlier metrics are 1.25, 1.25**2, 1.25**3.
+  These thresholds are so loose that many methods achieve > 99%.
+  Therefore, we choose 1.25 and 1.1 to encourage high-precision predictions.
+
+  Example usage:
+
+  dstq_obj = depth_aware_segmentation_and_tracking_quality.DSTQuality(
+    num_classes, things_list, ignore_label, max_instances_per_category,
+    offset, depth_threshold)
+  dstq_obj.update_state(y_true_1, y_pred_1, d_true_1, d_pred_1)
+  dstq_obj.update_state(y_true_2, y_pred_2, d_true_2, d_pred_2)
+  ...
+  result = dstq_obj.result().numpy()
+  """
+
+  _depth_threshold: Tuple[float, float] = (1.25, 1.1)
+  _depth_total_counts: collections.OrderedDict
+  _depth_inlier_counts: List[collections.OrderedDict]
+
+  def __init__(self,
+               num_classes: int,
+               things_list: Sequence[int],
+               ignore_label: int,
+               max_instances_per_category: int,
+               offset: int,
+               depth_threshold: Tuple[float] = (1.25, 1.1),
+               name: str = 'dstq',):  # pytype: disable=annotation-type-mismatch
+    """Initialization of the DSTQ metric.
+
+    Args:
+      num_classes: Number of classes in the dataset as an integer.
+      things_list: A sequence of class ids that belong to `things`.
+      ignore_label: The class id to be ignored in evaluation as an integer or
+        integer tensor.
+      max_instances_per_category: The maximum number of instances for each
+        class as an integer or integer tensor.
+      offset: The maximum number of unique labels as an integer or integer
+        tensor.
+      depth_threshold: A sequence of depth thresholds for the depth quality.
+        (default: (1.25, 1.1))
+      name: An optional name. (default: 'dstq')
+    """
+    super().__init__(num_classes, things_list, ignore_label,
+                     max_instances_per_category, offset, name)
+    if not (isinstance(depth_threshold, tuple) or
+            isinstance(depth_threshold, list)):
+      raise TypeError('The type of depth_threshold must be tuple or list.')
+    if not depth_threshold:
+      raise ValueError('depth_threshold must be non-empty.')
+    self._depth_threshold = tuple(depth_threshold)
+    self._depth_total_counts = collections.OrderedDict()
+    self._depth_inlier_counts = []
+    for _ in range(len(self._depth_threshold)):
+      self._depth_inlier_counts.append(collections.OrderedDict())
+
+  def update_state(self,
+                   y_true: tf.Tensor,
+                   y_pred: tf.Tensor,
+                   d_true: tf.Tensor,
+                   d_pred: tf.Tensor,
+                   sequence_id: int = 0):
+    """Accumulates the depth-aware segmentation and tracking quality statistics.
+
+    Args:
+      y_true: The ground-truth panoptic label map for a particular video frame
+        (defined as semantic_map * max_instances_per_category + instance_map).
+      y_pred: The predicted panoptic label map for a particular video frame
+        (defined as semantic_map * max_instances_per_category + instance_map).
+      d_true: The ground-truth depth map for this video frame.
+      d_pred: The predicted depth map for this video frame.
+      sequence_id: The optional ID of the sequence the frames belong to. When
+        no sequence is given, all frames are considered to belong to the same
+        sequence (default: 0).
+    """
+    super().update_state(y_true, y_pred, sequence_id)
+    # Valid depth labels contain positive values.
+    d_valid_mask = d_true > 0
+    d_valid_total = tf.reduce_sum(tf.cast(d_valid_mask, tf.int32))
+    # Valid depth prediction is expected to contain positive values.
+    d_valid_mask = tf.logical_and(d_valid_mask, d_pred > 0)
+    d_valid_true = tf.boolean_mask(d_true, d_valid_mask)
+    d_valid_pred = tf.boolean_mask(d_pred, d_valid_mask)
+    inlier_error = tf.maximum(d_valid_pred / d_valid_true,
+                              d_valid_true / d_valid_pred)
+    # For each threshold, count the number of inliers.
+    for threshold_index, threshold in enumerate(self._depth_threshold):
+      num_inliers = tf.reduce_sum(tf.cast(inlier_error <= threshold, tf.int32))
+      inlier_counts = self._depth_inlier_counts[threshold_index]
+      inlier_counts[sequence_id] = (inlier_counts.get(sequence_id, 0) +
+                                    int(num_inliers.numpy()))
+    # Update the total counts of the depth labels.
+    self._depth_total_counts[sequence_id] = (
+        self._depth_total_counts.get(sequence_id, 0) +
+        int(d_valid_total.numpy()))
+
+  def result(self):
+    """Computes the depth-aware segmentation and tracking quality.
+
+    Returns:
+      A dictionary containing:
+        - 'STQ': The total STQ score.
+        - 'AQ': The total association quality (AQ) score.
+        - 'IoU': The total mean IoU.
+        - 'STQ_per_seq': A list of the STQ score per sequence.
+        - 'AQ_per_seq': A list of the AQ score per sequence.
+        - 'IoU_per_seq': A list of mean IoU per sequence.
+        - 'ID_per_seq': A list of sequence Ids to map list index to sequence.
+        - 'Length_per_seq': A list of the length of each sequence.
+        - 'DSTQ': The total DSTQ score.
+        - 'DSTQ@thres': The total DSTQ score for threshold thres.
+        - 'DSTQ_per_seq@thres': A list of DSTQ scores per sequence for thres.
+        - 'DQ': The total DQ score.
+        - 'DQ@thres': The total DQ score for threshold thres.
+        - 'DQ_per_seq@thres': A list of DQ scores per sequence for thres.
+    """
+    # Gather the results for STQ.
+    stq_results = super().result()
+    # Collect results for depth quality per sequence and threshold.
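+    # For a threshold t, the per-sequence depth quality is
+    # DQ@t(seq) = inlier_count(seq, t) / valid_label_count(seq); the overall
+    # DQ@t pools counts across sequences before dividing, so longer sequences
+    # carry proportionally more weight.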
+ dq_per_seq_at_threshold = {} + dq_at_threshold = {} + for threshold_index, threshold in enumerate(self._depth_threshold): + dq_per_seq_at_threshold[threshold] = [0] * len(self._ground_truth) + total_count = 0 + inlier_count = 0 + # Follow the order of computing STQ by enumerating _ground_truth. + for index, sequence_id in enumerate(self._ground_truth): + sequence_inlier = self._depth_inlier_counts[threshold_index][ + sequence_id] + sequence_total = self._depth_total_counts[sequence_id] + if sequence_total > 0: + dq_per_seq_at_threshold[threshold][ + index] = sequence_inlier / sequence_total + total_count += sequence_total + inlier_count += sequence_inlier + if total_count == 0: + dq_at_threshold[threshold] = 0 + else: + dq_at_threshold[threshold] = inlier_count / total_count + # Compute DQ as the geometric mean of DQ's at different thresholds. + dq = 1 + for _, threshold in enumerate(self._depth_threshold): + dq *= dq_at_threshold[threshold] + dq = dq ** (1 / len(self._depth_threshold)) + dq_results = {} + dq_results['DQ'] = dq + for _, threshold in enumerate(self._depth_threshold): + dq_results['DQ@{}'.format(threshold)] = dq_at_threshold[threshold] + dq_results['DQ_per_seq@{}'.format( + threshold)] = dq_per_seq_at_threshold[threshold] + # Combine STQ and DQ to get DSTQ. + dstq_results = {} + dstq_results['DSTQ'] = (stq_results['STQ'] ** 2 * dq) ** (1/3) + for _, threshold in enumerate(self._depth_threshold): + dstq_results['DSTQ@{}'.format(threshold)] = ( + stq_results['STQ'] ** 2 * dq_at_threshold[threshold]) ** (1/3) + dstq_results['DSTQ_per_seq@{}'.format(threshold)] = [ + (stq_result**2 * dq_result)**(1 / 3) for stq_result, dq_result in zip( + stq_results['STQ_per_seq'], dq_per_seq_at_threshold[threshold]) + ] + # Merge all the results. + dstq_results.update(stq_results) + dstq_results.update(dq_results) + return dstq_results + + def reset_states(self): + """Resets all states that accumulated data.""" + super().reset_states() + self._depth_total_counts = collections.OrderedDict() + self._depth_inlier_counts = [] + for _ in range(len(self._depth_threshold)): + self._depth_inlier_counts.append(collections.OrderedDict()) diff --git a/evaluation/depth_aware_segmentation_and_tracking_quality_test.py b/evaluation/depth_aware_segmentation_and_tracking_quality_test.py new file mode 100644 index 0000000000000000000000000000000000000000..222ea0bc62f46fd36c3044515682416d9424e5df --- /dev/null +++ b/evaluation/depth_aware_segmentation_and_tracking_quality_test.py @@ -0,0 +1,283 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for depth_aware_segmentation_and_tracking_quality.""" + +import numpy as np +import tensorflow as tf + +from deeplab2.evaluation import depth_aware_segmentation_and_tracking_quality as dstq + + +class DepthAwareSegmentationAndTrackingQualityTest(tf.test.TestCase): + + def test_complex_example(self): + n_classes = 3 + ignore_label = 255 + # classes = ['sky', 'vegetation', 'cars']. 
+ things_list = [2] + max_instances_per_category = 1000 + + ground_truth_semantic_1 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 2, 0, 1, 1, 1], + [0, 2, 2, 2, 2, 1, 1, 1], + [2, 2, 2, 2, 2, 2, 1, 1], + [2, 2, 2, 2, 2, 2, 2, 1], + [2, 2, 2, 2, 2, 2, 2, 1], + [2, 2, 2, 2, 2, 2, 1, 1]]) + ground_truth_semantic_2 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 2, 0, 0, 1, 1, 0, 0], + [2, 2, 2, 1, 1, 1, 1, 0], + [2, 2, 2, 2, 1, 1, 1, 1], + [2, 2, 2, 2, 2, 1, 1, 1], + [2, 2, 2, 2, 2, 1, 1, 1], + [2, 2, 2, 2, 1, 1, 1, 1]]) + ground_truth_semantic_3 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [2, 0, 1, 1, 1, 0, 0, 0], + [2, 2, 1, 1, 1, 1, 0, 0], + [2, 2, 2, 1, 1, 1, 1, 0], + [2, 2, 2, 1, 1, 1, 1, 1], + [2, 2, 2, 1, 1, 1, 1, 1]]) + ground_truth_semantic = np.stack([ + ground_truth_semantic_1, ground_truth_semantic_2, + ground_truth_semantic_3 + ]) + + ground_truth_instance_1 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 2, 0, 0, 0, 0], + [0, 2, 2, 2, 2, 0, 0, 0], + [2, 2, 2, 2, 2, 2, 0, 0], + [2, 2, 2, 2, 2, 2, 2, 0], + [2, 2, 2, 2, 2, 2, 2, 0], + [2, 2, 2, 2, 2, 2, 0, 0]]) + ground_truth_instance_2 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 2, 0, 0, 0, 0, 0, 0], + [2, 2, 2, 0, 0, 0, 0, 0], + [2, 2, 2, 2, 0, 0, 0, 0], + [2, 2, 2, 2, 2, 0, 0, 0], + [2, 2, 2, 2, 2, 0, 0, 0], + [2, 2, 2, 2, 0, 0, 0, 0]]) + ground_truth_instance_3 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [2, 0, 0, 0, 0, 0, 0, 0], + [2, 2, 0, 0, 0, 0, 0, 0], + [2, 2, 2, 0, 0, 0, 0, 0], + [2, 2, 2, 0, 0, 0, 0, 0], + [2, 2, 2, 0, 0, 0, 0, 0]]) + + ground_truth_instance = np.stack([ + ground_truth_instance_1, ground_truth_instance_2, + ground_truth_instance_3 + ]) + ground_truth = (ground_truth_semantic * max_instances_per_category + + ground_truth_instance) + + prediction_semantic_1 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 0, 0], + [0, 0, 0, 2, 2, 1, 1, 1], + [0, 2, 2, 2, 2, 2, 1, 1], + [2, 2, 2, 2, 2, 2, 2, 1], + [2, 2, 2, 2, 2, 2, 2, 1], + [2, 2, 2, 2, 2, 2, 2, 1]]) + prediction_semantic_2 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 1, 1, 0, 0], + [0, 2, 2, 2, 1, 1, 1, 1], + [2, 2, 2, 2, 1, 1, 1, 1], + [2, 2, 2, 2, 2, 1, 1, 1], + [2, 2, 2, 2, 2, 2, 1, 1], + [2, 2, 2, 2, 2, 1, 1, 1]]) + prediction_semantic_3 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 1, 0, 0, 0], + [0, 0, 1, 1, 1, 1, 0, 0], + [2, 2, 2, 1, 1, 1, 0, 0], + [2, 2, 2, 1, 1, 1, 1, 1], + [2, 2, 2, 2, 1, 1, 1, 1], + [2, 2, 2, 2, 1, 1, 1, 1]]) + prediction_semantic = np.stack( + [prediction_semantic_1, prediction_semantic_2, prediction_semantic_3]) + + prediction_instance_1 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 2, 2, 0, 0, 0], + [0, 2, 2, 2, 2, 1, 0, 0], + [2, 2, 2, 2, 2, 1, 1, 0], + [2, 2, 2, 2, 1, 1, 1, 0], + [2, 2, 2, 2, 1, 1, 1, 0]]) + prediction_instance_2 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 2, 2, 2, 0, 0, 0, 0], + [2, 2, 2, 2, 0, 0, 0, 0], + [2, 2, 2, 2, 2, 0, 0, 0], + [2, 2, 2, 2, 1, 1, 0, 0], + [2, 2, 2, 2, 1, 0, 0, 0]]) + prediction_instance_3 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [2, 2, 2, 0, 0, 0, 0, 0], + [2, 2, 
2, 0, 0, 0, 0, 0],
+                                      [2, 2, 2, 2, 0, 0, 0, 0],
+                                      [2, 2, 2, 2, 0, 0, 0, 0]])
+    prediction_instance = np.stack(
+        [prediction_instance_1, prediction_instance_2, prediction_instance_3])
+    prediction = (prediction_semantic * max_instances_per_category +
+                  prediction_instance)
+
+    ground_truth_depth = np.array(
+        [[56.1, 50.9, 54.0, 63.6, 68.6, 50.9, 50.9, 58.1],
+         [62.6, 52.1, 00.0, 60.9, 62.4, 52.6, 56.3, 63.4],
+         [57.1, 61.2, 63.8, 63.1, 52.3, 54.3, 52.1, 51.4],
+         [65.8, 50.5, 58.9, 54.3, 00.0, 65.4, 63.8, 56.8],
+         [50.6, 56.5, 53.0, 66.9, 51.8, 58.6, 65.9, 66.4],
+         [53.5, 56.2, 53.6, 50.6, 64.6, 51.1, 68.7, 50.3],
+         [69.0, 65.3, 66.4, 51.9, 68.3, 50.5, 00.0, 67.4],
+         [59.7, 51.3, 50.1, 67.2, 68.8, 62.8, 64.9, 59.5]])
+    prediction_depth = np.array(
+        [[67.5, 36.9, 65.7, 77.9, 75.0, 45.1, 68.2, 63.3],
+         [43.8, 63.0, 79.4, 78.1, 82.2, 36.9, 59.2, 83.2],
+         [70.6, 73.2, 77.8, 71.3, 41.3, 47.5, 58.8, 64.8],
+         [60.5, 51.7, 72.2, 49.8, 56.1, 60.7, 72.2, 73.0],
+         [34.5, 55.7, 46.7, 47.4, 69.6, 43.5, 82.3, 84.8],
+         [46.9, 39.5, 35.4, 61.3, 79.4, 42.2, 48.9, 56.3],
+         [57.0, 75.0, 84.2, 46.3, 67.4, 55.5, 46.9, 70.0],
+         [62.3, 58.3, 59.4, 74.5, 70.6, 54.6, 78.6, 48.1]])
+
+    with self.subTest('No valid depth labels'):
+      # Compute DSTQuality.
+      dstq_metric = dstq.DSTQuality(
+          n_classes, things_list, ignore_label, max_instances_per_category,
+          256 * 256, (1.25, 1.1))
+      no_valid_ground_truth_depth = ground_truth_depth * 0
+
+      for i in range(3):
+        dstq_metric.update_state(
+            tf.convert_to_tensor(ground_truth[i, ...], dtype=tf.int32),
+            tf.convert_to_tensor(prediction[i, ...], dtype=tf.int32),
+            tf.convert_to_tensor(no_valid_ground_truth_depth,
+                                 dtype=tf.float32),
+            tf.convert_to_tensor(prediction_depth, dtype=tf.float32),
+            1)
+      result = dstq_metric.result()
+
+      # Check if additional implementations alter the STQ results.
+      # The example is copied from the complex example for testing STQ.
+      # The results are expected to be unchanged.
+      np.testing.assert_almost_equal(result['STQ'], 0.66841773352)
+      np.testing.assert_almost_equal(result['AQ'], 0.55366581415)
+      np.testing.assert_almost_equal(result['IoU'], 0.8069529580309542)
+      np.testing.assert_almost_equal(result['STQ_per_seq'], [0.66841773352])
+      np.testing.assert_almost_equal(result['AQ_per_seq'], [0.55366581415])
+      np.testing.assert_almost_equal(result['IoU_per_seq'],
                                     [0.8069529580309542])
+      np.testing.assert_almost_equal(result['ID_per_seq'], [1])
+      np.testing.assert_almost_equal(result['Length_per_seq'], [3])
+      # As there are no valid depth labels, all depth metrics should be 0.
+      np.testing.assert_almost_equal(result['DSTQ'], 0.0)
+      np.testing.assert_almost_equal(result['DSTQ@1.1'], 0.0)
+      np.testing.assert_almost_equal(result['DSTQ@1.25'], 0.0)
+      np.testing.assert_almost_equal(result['DSTQ_per_seq@1.1'], [0.0])
+      np.testing.assert_almost_equal(result['DSTQ_per_seq@1.25'], [0.0])
+      np.testing.assert_almost_equal(result['DQ'], 0.0)
+      np.testing.assert_almost_equal(result['DQ@1.1'], 0.0)
+      np.testing.assert_almost_equal(result['DQ@1.25'], 0.0)
+      np.testing.assert_almost_equal(result['DQ_per_seq@1.1'], [0.0])
+      np.testing.assert_almost_equal(result['DQ_per_seq@1.25'], [0.0])
+
+    with self.subTest('Default depth thresholds'):
+      # Compute DSTQuality.
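+      # The offset of 256 * 256 = 65536 comfortably exceeds the minimum of
+      # n_classes * max_instances_per_category = 3000 unique labels that the
+      # base STQ metric requires.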
+ dstq_metric = dstq.DSTQuality( + n_classes, things_list, ignore_label, max_instances_per_category, + 256 * 256, (1.25, 1.1)) + + for i in range(3): + dstq_metric.update_state( + tf.convert_to_tensor(ground_truth[i, ...], dtype=tf.int32), + tf.convert_to_tensor(prediction[i, ...], dtype=tf.int32), + tf.convert_to_tensor(ground_truth_depth, dtype=tf.float32), + tf.convert_to_tensor(prediction_depth, dtype=tf.float32), + 1) + + result = dstq_metric.result() + # Prepare groundtruth metrics. + valid_depth_labels_total = np.sum(ground_truth_depth > 0) + valid_depth_labels = ground_truth_depth[ground_truth_depth > 0] + valid_depth_pred = prediction_depth[ground_truth_depth > 0] + valid_depth_error = np.maximum(valid_depth_pred / valid_depth_labels, + valid_depth_labels / valid_depth_pred) + dq_1_1 = np.sum(valid_depth_error <= 1.1) / valid_depth_labels_total + dq_1_25 = np.sum(valid_depth_error <= 1.25) / valid_depth_labels_total + + # Check if additional implementations alter the STQ results. + # The example is copied from the complex example for testing STQ. + # The results are expected to be unchanged. + np.testing.assert_almost_equal(result['STQ'], 0.66841773352) + np.testing.assert_almost_equal(result['AQ'], 0.55366581415) + np.testing.assert_almost_equal(result['IoU'], 0.8069529580309542) + np.testing.assert_almost_equal(result['STQ_per_seq'], [0.66841773352]) + np.testing.assert_almost_equal(result['AQ_per_seq'], [0.55366581415]) + np.testing.assert_almost_equal(result['IoU_per_seq'], + [0.8069529580309542]) + np.testing.assert_almost_equal(result['ID_per_seq'], [1]) + np.testing.assert_almost_equal(result['Length_per_seq'], [3]) + # Results are checked by groundtruth or equations. + np.testing.assert_almost_equal(result['DSTQ'] ** 3, + result['STQ'] ** 2 * result['DQ']) + np.testing.assert_almost_equal(result['DSTQ@1.1'] ** 3, + result['STQ'] ** 2 * result['DQ@1.1']) + np.testing.assert_almost_equal(result['DSTQ@1.25'] ** 3, + result['STQ'] ** 2 * result['DQ@1.25']) + np.testing.assert_almost_equal(result['DSTQ_per_seq@1.1'], + [result['DSTQ@1.1']]) + np.testing.assert_almost_equal(result['DSTQ_per_seq@1.25'], + [result['DSTQ@1.25']]) + np.testing.assert_almost_equal(result['DQ'] ** 2, + result['DQ@1.1'] * result['DQ@1.25']) + np.testing.assert_almost_equal(result['DQ@1.1'], dq_1_1) + np.testing.assert_almost_equal(result['DQ@1.25'], dq_1_25) + np.testing.assert_almost_equal(result['DQ_per_seq@1.1'], + [result['DQ@1.1']]) + np.testing.assert_almost_equal(result['DQ_per_seq@1.25'], + [result['DQ@1.25']]) + # Results are checked by real numbers. 
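+      # Hand count: ground_truth_depth has 61 valid (non-zero) pixels, of
+      # which 13 predictions fall within a factor of 1.1 and 42 within a
+      # factor of 1.25, i.e. DQ@1.1 = 13/61 and DQ@1.25 = 42/61.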
+ np.testing.assert_almost_equal(result['DSTQ'], 0.5552059833215103) + np.testing.assert_almost_equal(result['DSTQ@1.1'], 0.45663565048742255) + np.testing.assert_almost_equal(result['DSTQ@1.25'], + 0.6750539157136957) + np.testing.assert_almost_equal(result['DSTQ_per_seq@1.1'], + [0.45663565048742255]) + np.testing.assert_almost_equal(result['DSTQ_per_seq@1.25'], + [0.6750539157136957]) + np.testing.assert_almost_equal(result['DQ'], 0.3830597195261614) + np.testing.assert_almost_equal(result['DQ@1.1'], 0.21311475409836064) + np.testing.assert_almost_equal(result['DQ@1.25'], 0.6885245901639344) + np.testing.assert_almost_equal(result['DQ_per_seq@1.1'], + [0.21311475409836064]) + np.testing.assert_almost_equal(result['DQ_per_seq@1.25'], + [0.6885245901639344]) + + +if __name__ == '__main__': + tf.test.main() diff --git a/evaluation/panoptic_quality.py b/evaluation/panoptic_quality.py new file mode 100644 index 0000000000000000000000000000000000000000..8f8d089a0f725176acfd5f2b9fc3ffc63bdd802a --- /dev/null +++ b/evaluation/panoptic_quality.py @@ -0,0 +1,266 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Implementation of the Panoptic Quality metric. + +Panoptic Quality is an instance-based metric for evaluating the task of +image parsing, aka panoptic segmentation. + +Please see the paper for details: +"Panoptic Segmentation", Alexander Kirillov, Kaiming He, Ross Girshick, +Carsten Rother and Piotr Dollar. arXiv:1801.00868, 2018. +""" + +from typing import Any, List, Mapping, Optional, Tuple + +import numpy as np +import tensorflow as tf + + +def _ids_to_counts(id_array: np.ndarray) -> Mapping[int, int]: + """Given a numpy array, a mapping from each unique entry to its count.""" + ids, counts = np.unique(id_array, return_counts=True) + return dict(zip(ids, counts)) + + +class PanopticQuality(tf.keras.metrics.Metric): + """Metric class for Panoptic Quality. + + "Panoptic Segmentation" by Alexander Kirillov, Kaiming He, Ross Girshick, + Carsten Rother, Piotr Dollar. + https://arxiv.org/abs/1801.00868 + + Stand-alone usage: + + pq_obj = panoptic_quality.PanopticQuality(num_classes, + max_instances_per_category, ignored_label) + pq_obj.update_state(y_true_1, y_pred_1) + pq_obj.update_state(y_true_2, y_pred_2) + ... + result = pq_obj.result().numpy() + """ + + def __init__(self, + num_classes: int, + ignored_label: int, + max_instances_per_category: int, + offset: int, + name: str = 'panoptic_quality', + **kwargs): + """Initialization of the PanopticQuality metric. + + Args: + num_classes: Number of classes in the dataset as an integer. + ignored_label: The class id to be ignored in evaluation as an integer or + integer tensor. + max_instances_per_category: The maximum number of instances for each class + as an integer or integer tensor. + offset: The maximum number of unique labels as an integer or integer + tensor. + name: An optional variable_scope name. 
(default: 'panoptic_quality')
+      **kwargs: The keyword arguments that are passed on to the base
+        `tf.keras.metrics.Metric` class.
+    """
+    super(PanopticQuality, self).__init__(name=name, **kwargs)
+    self.num_classes = num_classes
+    self.ignored_label = ignored_label
+    self.max_instances_per_category = max_instances_per_category
+    self.total_iou = self.add_weight(
+        'total_iou', shape=(num_classes,), initializer=tf.zeros_initializer)
+    self.total_tp = self.add_weight(
+        'total_tp', shape=(num_classes,), initializer=tf.zeros_initializer)
+    self.total_fn = self.add_weight(
+        'total_fn', shape=(num_classes,), initializer=tf.zeros_initializer)
+    self.total_fp = self.add_weight(
+        'total_fp', shape=(num_classes,), initializer=tf.zeros_initializer)
+    self.offset = offset
+
+  def compare_and_accumulate(
+      self, gt_panoptic_label: tf.Tensor, pred_panoptic_label: tf.Tensor
+  ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Compares predicted segmentation with groundtruth, accumulates its metric.
+
+    It is not assumed that instance ids are unique across different categories.
+    See for example combine_semantic_and_instance_predictions.py in official
+    PanopticAPI evaluation code for issues to consider when fusing category
+    and instance labels.
+
+    Instance ids of the ignored category have the meaning that id 0 is "void"
+    and the remaining ones are crowd instances.
+
+    Args:
+      gt_panoptic_label: A tensor that combines label array from categories and
+        instances for ground truth.
+      pred_panoptic_label: A tensor that combines label array from categories
+        and instances for the prediction.
+
+    Returns:
+      A tuple of per-class numpy arrays (iou, tp, fn, fp) holding the values
+      accumulated from this comparison.
+    """
+    iou_per_class = np.zeros(self.num_classes, dtype=np.float64)
+    tp_per_class = np.zeros(self.num_classes, dtype=np.float64)
+    fn_per_class = np.zeros(self.num_classes, dtype=np.float64)
+    fp_per_class = np.zeros(self.num_classes, dtype=np.float64)
+
+    # Pre-calculate areas for all groundtruth and predicted segments.
+    gt_segment_areas = _ids_to_counts(gt_panoptic_label.numpy())
+    pred_segment_areas = _ids_to_counts(pred_panoptic_label.numpy())
+
+    # We assume the ignored segment has instance id = 0.
+    ignored_panoptic_id = self.ignored_label * self.max_instances_per_category
+
+    # Next, combine the groundtruth and predicted labels. Dividing up the
+    # pixels based on which groundtruth segment and which predicted segment
+    # they belong to, this will assign a different 64-bit integer label to
+    # each choice of (groundtruth segment, predicted segment), encoded as
+    # gt_panoptic_label * offset + pred_panoptic_label.
+    intersection_id_array = tf.cast(gt_panoptic_label,
+                                    tf.int64) * self.offset + tf.cast(
+                                        pred_panoptic_label, tf.int64)
+
+    # For every combination of (groundtruth segment, predicted segment) with a
+    # non-empty intersection, this counts the number of pixels in that
+    # intersection.
+    intersection_areas = _ids_to_counts(intersection_id_array.numpy())
+
+    # Compute overall ignored overlap.
+    def prediction_ignored_overlap(pred_panoptic_label):
+      intersection_id = ignored_panoptic_id * self.offset + pred_panoptic_label
+      return intersection_areas.get(intersection_id, 0)
+
+    # Sets that track which groundtruth and predicted segments have already
+    # been matched with an overlapping segment on the other side.
+    gt_matched = set()
+    pred_matched = set()
+
+    # Calculate IoU per pair of intersecting segments of the same category.
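+    # Each intersection id decodes back into its two panoptic labels via
+    # intersection_id // offset (groundtruth) and intersection_id % offset
+    # (prediction); each panoptic label in turn splits into (category,
+    # instance) via // and % max_instances_per_category.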
+    for intersection_id, intersection_area in intersection_areas.items():
+      gt_panoptic_label = intersection_id // self.offset
+      pred_panoptic_label = intersection_id % self.offset
+
+      gt_category = gt_panoptic_label // self.max_instances_per_category
+      pred_category = pred_panoptic_label // self.max_instances_per_category
+      if gt_category != pred_category:
+        continue
+      if pred_category == self.ignored_label:
+        continue
+
+      # The union between the groundtruth and predicted segments being
+      # compared does not include the portion of the predicted segment that
+      # consists of groundtruth "void" pixels.
+      union = (
+          gt_segment_areas[gt_panoptic_label] +
+          pred_segment_areas[pred_panoptic_label] - intersection_area -
+          prediction_ignored_overlap(pred_panoptic_label))
+      iou = intersection_area / union
+      if iou > 0.5:
+        tp_per_class[gt_category] += 1
+        iou_per_class[gt_category] += iou
+        gt_matched.add(gt_panoptic_label)
+        pred_matched.add(pred_panoptic_label)
+
+    # Count false negatives for each category.
+    for gt_panoptic_label in gt_segment_areas:
+      if gt_panoptic_label in gt_matched:
+        continue
+      category = gt_panoptic_label // self.max_instances_per_category
+      # Failing to detect a void segment is not a false negative.
+      if category == self.ignored_label:
+        continue
+      fn_per_class[category] += 1
+
+    # Count false positives for each category.
+    for pred_panoptic_label in pred_segment_areas:
+      if pred_panoptic_label in pred_matched:
+        continue
+      # A false positive is not penalized if it is mostly ignored in the
+      # groundtruth.
+      if (prediction_ignored_overlap(pred_panoptic_label) /
+          pred_segment_areas[pred_panoptic_label]) > 0.5:
+        continue
+      category = pred_panoptic_label // self.max_instances_per_category
+      if category == self.ignored_label:
+        continue
+      fp_per_class[category] += 1
+    return iou_per_class, tp_per_class, fn_per_class, fp_per_class
+
+  def update_state(
+      self,
+      y_true: tf.Tensor,
+      y_pred: tf.Tensor,
+      sample_weight: Optional[tf.Tensor] = None) -> List[tf.Operation]:
+    """Accumulates the panoptic quality statistics.
+
+    Args:
+      y_true: The ground truth panoptic label map (defined as semantic_map *
+        max_instances_per_category + instance_map).
+      y_pred: The predicted panoptic label map (defined as semantic_map *
+        max_instances_per_category + instance_map).
+      sample_weight: Optional weighting of each example. Defaults to 1. Can be
+        a `Tensor` whose rank is either 0, or the same rank as `y_true`, and
+        must be broadcastable to `y_true`.
+
+    Returns:
+      Update ops for iou, tp, fn, fp.
+    """
+    result = self.compare_and_accumulate(y_true, y_pred)
+    iou, tp, fn, fp = tuple(result)
+    update_iou_op = self.total_iou.assign_add(iou)
+    update_tp_op = self.total_tp.assign_add(tp)
+    update_fn_op = self.total_fn.assign_add(fn)
+    update_fp_op = self.total_fp.assign_add(fp)
+    return [update_iou_op, update_tp_op, update_fn_op, update_fp_op]
+
+  def result(self) -> tf.Tensor:
+    """Computes the panoptic quality."""
+    sq = tf.math.divide_no_nan(self.total_iou, self.total_tp)
+    rq = tf.math.divide_no_nan(
+        self.total_tp,
+        self.total_tp + 0.5 * self.total_fn + 0.5 * self.total_fp)
+    pq = tf.math.multiply(sq, rq)
+
+    # Find the valid classes that will be used for evaluation. We will
+    # ignore classes which have (tp + fn + fp) equal to 0.
+    # The "ignore" label will be included in this based on logic that skips
+    # counting those instances/regions.
+    valid_classes = tf.not_equal(self.total_tp + self.total_fn + self.total_fp,
+                                 0)
+
+    # Compute averages over classes.
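+    # Per class, PQ factors as SQ * RQ; stacking [pq, sq, rq, tp, fn, fp]
+    # and averaging over only the valid classes keeps absent classes from
+    # dragging the summary down.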
+ qualities = tf.stack( + [pq, sq, rq, self.total_tp, self.total_fn, self.total_fp], axis=0) + summarized_qualities = tf.math.reduce_mean( + tf.boolean_mask(qualities, valid_classes, axis=1), axis=1) + + return summarized_qualities + + def reset_states(self) -> None: + """See base class.""" + tf.keras.backend.set_value(self.total_iou, np.zeros(self.num_classes)) + tf.keras.backend.set_value(self.total_tp, np.zeros(self.num_classes)) + tf.keras.backend.set_value(self.total_fn, np.zeros(self.num_classes)) + tf.keras.backend.set_value(self.total_fp, np.zeros(self.num_classes)) + + def get_config(self) -> Mapping[str, Any]: + """See base class.""" + config = { + 'num_classes': self.num_classes, + 'ignored_label': self.ignored_label, + 'max_instances_per_category': self.max_instances_per_category, + 'offset': self.offset, + } + base_config = super(PanopticQuality, self).get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/evaluation/panoptic_quality_test.py b/evaluation/panoptic_quality_test.py new file mode 100644 index 0000000000000000000000000000000000000000..ecef73fd8d93dbcac295f9f5431c1ba4cc08398b --- /dev/null +++ b/evaluation/panoptic_quality_test.py @@ -0,0 +1,214 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for panoptic_quality metrics.""" +import collections + +from absl import logging +import numpy as np +import tensorflow as tf + +from deeplab2.evaluation import panoptic_quality +from deeplab2.evaluation import test_utils + +# See the definition of the color names at: +# https://en.wikipedia.org/wiki/Web_colors. +_CLASS_COLOR_MAP = { + (0, 0, 0): 0, + (0, 0, 255): 1, # Person (blue). + (255, 0, 0): 2, # Bear (red). + (0, 255, 0): 3, # Tree (lime). + (255, 0, 255): 4, # Bird (fuchsia). + (0, 255, 255): 5, # Sky (aqua). + (255, 255, 0): 6, # Cat (yellow). 
+} + + +def combine_maps(semantic_map, instance_map, label_divisor): + combined_map = instance_map + semantic_map * label_divisor + return tf.cast(combined_map, tf.int32) + + +class PanopticQualityMetricTest(tf.test.TestCase): + + def test_streaming_metric_on_single_image(self): + max_instances_per_category = 1000 + instance_class_map = { + 0: 0, + 47: 1, + 97: 1, + 133: 1, + 150: 1, + 174: 1, + 198: 2, + 215: 1, + 244: 1, + 255: 1, + } + gt_instances, gt_classes = test_utils.panoptic_segmentation_with_class_map( + 'team_gt_instance.png', instance_class_map) + + pred_classes = test_utils.read_segmentation_with_rgb_color_map( + 'team_pred_class.png', _CLASS_COLOR_MAP) + pred_instances = test_utils.read_test_image( + 'team_pred_instance.png', image_format='L') + + pq_obj = panoptic_quality.PanopticQuality( + num_classes=3, + max_instances_per_category=max_instances_per_category, + ignored_label=0, offset=256*256) + + y_true = combine_maps(gt_classes, gt_instances, max_instances_per_category) + y_pred = combine_maps(pred_classes, pred_instances, + max_instances_per_category) + pq_obj.update_state(y_true, y_pred) + result = pq_obj.result().numpy() + self.assertAlmostEqual(result[0], 0.62156284, places=4) + self.assertAlmostEqual(result[1], 0.64664984, places=4) + self.assertAlmostEqual(result[2], 0.9666667, places=4) + self.assertEqual(result[3], 4.) + self.assertAlmostEqual(result[4], 0.5) + self.assertEqual(result[5], 0.) + + def test_streaming_metric_on_multiple_images(self): + num_classes = 7 + + bird_gt_instance_class_map = { + 92: 5, + 176: 3, + 255: 4, + } + cat_gt_instance_class_map = { + 0: 0, + 255: 6, + } + team_gt_instance_class_map = { + 0: 0, + 47: 1, + 97: 1, + 133: 1, + 150: 1, + 174: 1, + 198: 2, + 215: 1, + 244: 1, + 255: 1, + } + max_instances_per_category = 256 + test_image = collections.namedtuple( + 'TestImage', + ['gt_class_map', 'gt_path', 'pred_inst_path', 'pred_class_path']) + test_images = [ + test_image(bird_gt_instance_class_map, 'bird_gt.png', + 'bird_pred_instance.png', 'bird_pred_class.png'), + test_image(cat_gt_instance_class_map, 'cat_gt.png', + 'cat_pred_instance.png', 'cat_pred_class.png'), + test_image(team_gt_instance_class_map, 'team_gt_instance.png', + 'team_pred_instance.png', 'team_pred_class.png'), + ] + + gt_classes = [] + gt_instances = [] + pred_classes = [] + pred_instances = [] + for test_image in test_images: + (image_gt_instances, + image_gt_classes) = test_utils.panoptic_segmentation_with_class_map( + test_image.gt_path, test_image.gt_class_map) + gt_classes.append(image_gt_classes) + gt_instances.append(image_gt_instances) + + pred_classes.append( + test_utils.read_segmentation_with_rgb_color_map( + test_image.pred_class_path, _CLASS_COLOR_MAP)) + pred_instances.append( + test_utils.read_test_image(test_image.pred_inst_path, + image_format='L')) + + pq_obj = panoptic_quality.PanopticQuality( + num_classes=num_classes, + max_instances_per_category=max_instances_per_category, + ignored_label=0, offset=256*256) + for pred_class, pred_instance, gt_class, gt_instance in zip( + pred_classes, pred_instances, gt_classes, gt_instances): + y_true = combine_maps(gt_class, gt_instance, max_instances_per_category) + y_pred = combine_maps(pred_class, pred_instance, + max_instances_per_category) + pq_obj.update_state(y_true, y_pred) + result = pq_obj.result().numpy() + + self.assertAlmostEqual(result[0], 0.76855499, places=4) + self.assertAlmostEqual(result[1], 0.7769174, places=4) + self.assertAlmostEqual(result[2], 0.98888892, places=4) + 
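+    # result holds the class-averaged [pq, sq, rq, tp, fn, fp]; the last
+    # three entries are means over valid classes, not totals.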
self.assertEqual(result[3], 2.) + self.assertAlmostEqual(result[4], 1. / 6, places=4) + self.assertEqual(result[5], 0.) + + def test_predicted_non_contiguous_ignore_label(self): + max_instances_per_category = 256 + pq_obj = panoptic_quality.PanopticQuality( + num_classes=3, + max_instances_per_category=max_instances_per_category, + ignored_label=9, + offset=256 * 256) + + gt_class = [ + [0, 9, 9], + [1, 2, 2], + [1, 9, 9], + ] + gt_instance = [ + [0, 2, 2], + [1, 0, 0], + [1, 0, 0], + ] + y_true = combine_maps( + np.array(gt_class), np.array(gt_instance), max_instances_per_category) + logging.info('y_true=\n%s', y_true) + + pred_class = [ + [0, 0, 9], + [1, 1, 1], + [1, 9, 9], + ] + pred_instance = [ + [0, 0, 0], + [0, 1, 1], + [0, 1, 1], + ] + y_pred = combine_maps( + np.array(pred_class), np.array(pred_instance), + max_instances_per_category) + logging.info('y_pred=\n%s', y_pred) + + pq_obj.update_state(y_true, y_pred) + result = pq_obj.result().numpy() + + # pq + self.assertAlmostEqual(result[0], 2. / 9, places=4) + # sq + self.assertAlmostEqual(result[1], 1. / 3, places=4) + # rq + self.assertAlmostEqual(result[2], 2. / 9, places=4) + # tp + self.assertAlmostEqual(result[3], 1. / 3, places=4) + # fn + self.assertAlmostEqual(result[4], 2. / 3, places=4) + # fp + self.assertAlmostEqual(result[5], 2. / 3, places=4) + + +if __name__ == '__main__': + tf.test.main() diff --git a/evaluation/segmentation_and_tracking_quality.py b/evaluation/segmentation_and_tracking_quality.py new file mode 100644 index 0000000000000000000000000000000000000000..c6c3171c8c3e98cc265b296f7b9e44df190f0d9d --- /dev/null +++ b/evaluation/segmentation_and_tracking_quality.py @@ -0,0 +1,282 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Implementation of the Segmentation and Tracking Quality (STQ) metric.""" + +import collections +from typing import MutableMapping, Sequence, Dict, Text, Any +import numpy as np +import tensorflow as tf + + +def _update_dict_stats(stat_dict: MutableMapping[int, tf.Tensor], + id_array: tf.Tensor): + """Updates a given dict with corresponding counts.""" + ids, _, counts = tf.unique_with_counts(id_array) + for idx, count in zip(ids.numpy(), counts): + if idx in stat_dict: + stat_dict[idx] += count + else: + stat_dict[idx] = count + + +class STQuality(object): + """Metric class for the Segmentation and Tracking Quality (STQ). + + The metric computes the geometric mean of two terms. + - Association Quality: This term measures the quality of the track ID + assignment for `thing` classes. It is formulated as a weighted IoU + measure. + - Segmentation Quality: This term measures the semantic segmentation quality. + The standard class IoU measure is used for this. + + Example usage: + + stq_obj = segmentation_tracking_quality.STQuality(num_classes, things_list, + ignore_label, max_instances_per_category, offset) + stq_obj.update_state(y_true_1, y_pred_1) + stq_obj.update_state(y_true_2, y_pred_2) + ... 
+  result = stq_obj.result().numpy()
+  """
+
+  def __init__(self,
+               num_classes: int,
+               things_list: Sequence[int],
+               ignore_label: int,
+               max_instances_per_category: int,
+               offset: int,
+               name='stq'
+               ):
+    """Initialization of the STQ metric.
+
+    Args:
+      num_classes: Number of classes in the dataset as an integer.
+      things_list: A sequence of class ids that belong to `things`.
+      ignore_label: The class id to be ignored in evaluation as an integer or
+        integer tensor.
+      max_instances_per_category: The maximum number of instances for each
+        class as an integer or integer tensor.
+      offset: The maximum number of unique labels as an integer or integer
+        tensor.
+      name: An optional name. (default: 'stq')
+    """
+    self._name = name
+    self._num_classes = num_classes
+    self._ignore_label = ignore_label
+    self._things_list = things_list
+    self._max_instances_per_category = max_instances_per_category
+
+    if ignore_label >= num_classes:
+      self._confusion_matrix_size = num_classes + 1
+      self._include_indices = np.arange(self._num_classes)
+    else:
+      self._confusion_matrix_size = num_classes
+      self._include_indices = np.array(
+          [i for i in range(num_classes) if i != self._ignore_label])
+
+    self._iou_confusion_matrix_per_sequence = collections.OrderedDict()
+    self._predictions = collections.OrderedDict()
+    self._ground_truth = collections.OrderedDict()
+    self._intersections = collections.OrderedDict()
+    self._sequence_length = collections.OrderedDict()
+    self._offset = offset
+    lower_bound = num_classes * max_instances_per_category
+    if offset < lower_bound:
+      raise ValueError('The provided offset %d is too small. No guarantees '
+                       'about the correctness of the results can be made. '
+                       'Please choose an offset that is higher than '
+                       'num_classes * max_instances_per_category = %d' %
+                       (offset, lower_bound))
+
+  def update_state(self, y_true: tf.Tensor, y_pred: tf.Tensor,
+                   sequence_id=0):
+    """Accumulates the segmentation and tracking quality statistics.
+
+    Args:
+      y_true: The ground-truth panoptic label map for a particular video frame
+        (defined as semantic_map * max_instances_per_category + instance_map).
+      y_pred: The predicted panoptic label map for a particular video frame
+        (defined as semantic_map * max_instances_per_category + instance_map).
+      sequence_id: The optional ID of the sequence the frames belong to. When
+        no sequence is given, all frames are considered to belong to the same
+        sequence (default: 0).
+    """
+    y_true = tf.cast(y_true, dtype=tf.int64)
+    y_pred = tf.cast(y_pred, dtype=tf.int64)
+    semantic_label = y_true // self._max_instances_per_category
+    semantic_prediction = y_pred // self._max_instances_per_category
+    # Check if the ignore value is outside the range [0, num_classes]. If yes,
+    # map `_ignore_label` to `_num_classes`, so it can be used to create the
+    # confusion matrix.
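+    # For example, with num_classes = 3 and ignore_label = 255, pixels
+    # labeled 255 are remapped to class id 3, the extra row/column of the
+    # (num_classes + 1)-sized confusion matrix.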
+    if self._ignore_label > self._num_classes:
+      semantic_label = tf.where(
+          tf.not_equal(semantic_label, self._ignore_label), semantic_label,
+          self._num_classes)
+      semantic_prediction = tf.where(
+          tf.not_equal(semantic_prediction, self._ignore_label),
+          semantic_prediction, self._num_classes)
+    if sequence_id in self._iou_confusion_matrix_per_sequence:
+      self._iou_confusion_matrix_per_sequence[sequence_id] += (
+          tf.math.confusion_matrix(
+              tf.reshape(semantic_label, [-1]),
+              tf.reshape(semantic_prediction, [-1]),
+              self._confusion_matrix_size,
+              dtype=tf.int64))
+      self._sequence_length[sequence_id] += 1
+    else:
+      self._iou_confusion_matrix_per_sequence[sequence_id] = (
+          tf.math.confusion_matrix(
+              tf.reshape(semantic_label, [-1]),
+              tf.reshape(semantic_prediction, [-1]),
+              self._confusion_matrix_size,
+              dtype=tf.int64))
+      self._predictions[sequence_id] = {}
+      self._ground_truth[sequence_id] = {}
+      self._intersections[sequence_id] = {}
+      self._sequence_length[sequence_id] = 1
+
+    instance_label = y_true % self._max_instances_per_category
+
+    label_mask = tf.zeros_like(semantic_label, dtype=tf.bool)
+    prediction_mask = tf.zeros_like(semantic_prediction, dtype=tf.bool)
+    for things_class_id in self._things_list:
+      label_mask = tf.logical_or(label_mask,
+                                 tf.equal(semantic_label, things_class_id))
+      prediction_mask = tf.logical_or(
+          prediction_mask, tf.equal(semantic_prediction, things_class_id))
+
+    # Select the `crowd` region of the current class. This region is encoded
+    # with instance id `0`.
+    is_crowd = tf.logical_and(tf.equal(instance_label, 0), label_mask)
+    # Select the non-crowd region of the corresponding class, as the `crowd`
+    # region is ignored for the tracking term.
+    label_mask = tf.logical_and(label_mask, tf.logical_not(is_crowd))
+    # Do not punish id assignment for regions that are annotated as `crowd`
+    # in the ground-truth.
+    prediction_mask = tf.logical_and(prediction_mask,
+                                     tf.logical_not(is_crowd))
+
+    seq_preds = self._predictions[sequence_id]
+    seq_gts = self._ground_truth[sequence_id]
+    seq_intersects = self._intersections[sequence_id]
+
+    # Compute and update areas of ground-truth, predictions and intersections.
+    _update_dict_stats(seq_preds, y_pred[prediction_mask])
+    _update_dict_stats(seq_gts, y_true[label_mask])
+
+    non_crowd_intersection = tf.logical_and(label_mask, prediction_mask)
+    intersection_ids = (
+        y_true[non_crowd_intersection] * self._offset +
+        y_pred[non_crowd_intersection])
+    _update_dict_stats(seq_intersects, intersection_ids)
+
+  def result(self) -> Dict[Text, Any]:
+    """Computes the segmentation and tracking quality.
+
+    Returns:
+      A dictionary containing:
+        - 'STQ': The total STQ score.
+        - 'AQ': The total association quality (AQ) score.
+        - 'IoU': The total mean IoU.
+        - 'STQ_per_seq': A list of the STQ score per sequence.
+        - 'AQ_per_seq': A list of the AQ score per sequence.
+        - 'IoU_per_seq': A list of mean IoU per sequence.
+        - 'ID_per_seq': A list of sequence Ids to map list index to sequence.
+        - 'Length_per_seq': A list of the length of each sequence.
+ """ + # Compute association quality (AQ) + num_tubes_per_seq = [0] * len(self._ground_truth) + aq_per_seq = [0] * len(self._ground_truth) + iou_per_seq = [0] * len(self._ground_truth) + id_per_seq = [''] * len(self._ground_truth) + + for index, sequence_id in enumerate(self._ground_truth): + outer_sum = 0.0 + predictions = self._predictions[sequence_id] + ground_truth = self._ground_truth[sequence_id] + intersections = self._intersections[sequence_id] + num_tubes_per_seq[index] = len(ground_truth) + id_per_seq[index] = sequence_id + + for gt_id, gt_size in ground_truth.items(): + inner_sum = 0.0 + for pr_id, pr_size in predictions.items(): + tpa_key = self._offset * gt_id + pr_id + if tpa_key in intersections: + tpa = intersections[tpa_key].numpy() + fpa = pr_size.numpy() - tpa + fna = gt_size.numpy() - tpa + inner_sum += tpa * (tpa / (tpa + fpa + fna)) + + outer_sum += 1.0 / gt_size.numpy() * inner_sum + aq_per_seq[index] = outer_sum + + aq_mean = np.sum(aq_per_seq) / np.maximum(np.sum(num_tubes_per_seq), 1e-15) + aq_per_seq = aq_per_seq / np.maximum(num_tubes_per_seq, 1e-15) + + # Compute IoU scores. + # The rows correspond to ground-truth and the columns to predictions. + # Remove fp from confusion matrix for the void/ignore class. + total_confusion = np.zeros( + (self._confusion_matrix_size, self._confusion_matrix_size), + dtype=np.int64) + for index, confusion in enumerate( + self._iou_confusion_matrix_per_sequence.values()): + confusion = confusion.numpy() + removal_matrix = np.zeros_like(confusion) + removal_matrix[self._include_indices, :] = 1.0 + confusion *= removal_matrix + total_confusion += confusion + + # `intersections` corresponds to true positives. + intersections = confusion.diagonal() + fps = confusion.sum(axis=0) - intersections + fns = confusion.sum(axis=1) - intersections + unions = intersections + fps + fns + + num_classes = np.count_nonzero(unions) + ious = (intersections.astype(np.double) / + np.maximum(unions, 1e-15).astype(np.double)) + iou_per_seq[index] = np.sum(ious) / num_classes + + # `intersections` corresponds to true positives. + intersections = total_confusion.diagonal() + fps = total_confusion.sum(axis=0) - intersections + fns = total_confusion.sum(axis=1) - intersections + unions = intersections + fps + fns + + num_classes = np.count_nonzero(unions) + ious = (intersections.astype(np.double) / + np.maximum(unions, 1e-15).astype(np.double)) + iou_mean = np.sum(ious) / num_classes + + st_quality = np.sqrt(aq_mean * iou_mean) + st_quality_per_seq = np.sqrt(aq_per_seq * iou_per_seq) + return {'STQ': st_quality, + 'AQ': aq_mean, + 'IoU': float(iou_mean), + 'STQ_per_seq': st_quality_per_seq, + 'AQ_per_seq': aq_per_seq, + 'IoU_per_seq': iou_per_seq, + 'ID_per_seq': id_per_seq, + 'Length_per_seq': list(self._sequence_length.values()), + } + + def reset_states(self): + """Resets all states that accumulated data.""" + self._iou_confusion_matrix_per_sequence = collections.OrderedDict() + self._predictions = collections.OrderedDict() + self._ground_truth = collections.OrderedDict() + self._intersections = collections.OrderedDict() + self._sequence_length = collections.OrderedDict() diff --git a/evaluation/segmentation_and_tracking_quality_test.py b/evaluation/segmentation_and_tracking_quality_test.py new file mode 100644 index 0000000000000000000000000000000000000000..4f1a03293ffaf6b900342147ce5f68970ead690f --- /dev/null +++ b/evaluation/segmentation_and_tracking_quality_test.py @@ -0,0 +1,281 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for segmentation_tracking_quality.""" + +import numpy as np +import tensorflow as tf + +from deeplab2.evaluation import segmentation_and_tracking_quality as stq + + +def _compute_metric_and_compare(metric, ground_truth, prediction, + expected_result): + metric.update_state( + tf.convert_to_tensor(ground_truth), tf.convert_to_tensor(prediction), 1) + result = metric.result() + metric.reset_states() + np.testing.assert_almost_equal(result['STQ'], expected_result[0]) + np.testing.assert_almost_equal(result['AQ'], expected_result[1]) + np.testing.assert_almost_equal(result['IoU'], expected_result[2]) + np.testing.assert_almost_equal(result['STQ_per_seq'], [expected_result[0]]) + np.testing.assert_almost_equal(result['AQ_per_seq'], [expected_result[1]]) + np.testing.assert_almost_equal(result['IoU_per_seq'], [expected_result[2]]) + np.testing.assert_almost_equal(result['ID_per_seq'], [1]) + np.testing.assert_almost_equal(result['Length_per_seq'], [1]) + + +class STQualityTest(tf.test.TestCase): + + def test_complex_example(self): + n_classes = 3 + ignore_label = 255 + # classes = ['sky', 'vegetation', 'cars']. + things_list = [2] + max_instances_per_category = 1000 + + ground_truth_semantic_1 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 2, 0, 1, 1, 1], + [0, 2, 2, 2, 2, 1, 1, 1], + [2, 2, 2, 2, 2, 2, 1, 1], + [2, 2, 2, 2, 2, 2, 2, 1], + [2, 2, 2, 2, 2, 2, 2, 1], + [2, 2, 2, 2, 2, 2, 1, 1]]) + ground_truth_semantic_2 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 2, 0, 0, 1, 1, 0, 0], + [2, 2, 2, 1, 1, 1, 1, 0], + [2, 2, 2, 2, 1, 1, 1, 1], + [2, 2, 2, 2, 2, 1, 1, 1], + [2, 2, 2, 2, 2, 1, 1, 1], + [2, 2, 2, 2, 1, 1, 1, 1]]) + ground_truth_semantic_3 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [2, 0, 1, 1, 1, 0, 0, 0], + [2, 2, 1, 1, 1, 1, 0, 0], + [2, 2, 2, 1, 1, 1, 1, 0], + [2, 2, 2, 1, 1, 1, 1, 1], + [2, 2, 2, 1, 1, 1, 1, 1]]) + ground_truth_semantic = np.stack([ + ground_truth_semantic_1, ground_truth_semantic_2, + ground_truth_semantic_3 + ]) + + ground_truth_instance_1 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 2, 0, 0, 0, 0], + [0, 2, 2, 2, 2, 0, 0, 0], + [2, 2, 2, 2, 2, 2, 0, 0], + [2, 2, 2, 2, 2, 2, 2, 0], + [2, 2, 2, 2, 2, 2, 2, 0], + [2, 2, 2, 2, 2, 2, 0, 0]]) + ground_truth_instance_2 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 2, 0, 0, 0, 0, 0, 0], + [2, 2, 2, 0, 0, 0, 0, 0], + [2, 2, 2, 2, 0, 0, 0, 0], + [2, 2, 2, 2, 2, 0, 0, 0], + [2, 2, 2, 2, 2, 0, 0, 0], + [2, 2, 2, 2, 0, 0, 0, 0]]) + ground_truth_instance_3 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [2, 0, 0, 0, 0, 0, 0, 0], + [2, 2, 0, 0, 0, 0, 0, 0], + [2, 2, 2, 0, 0, 0, 0, 0], + [2, 2, 2, 0, 0, 0, 0, 0], + [2, 2, 2, 0, 0, 0, 0, 0]]) + + ground_truth_instance = np.stack([ + ground_truth_instance_1, ground_truth_instance_2, + ground_truth_instance_3 + 
]) + ground_truth = (ground_truth_semantic * max_instances_per_category + + ground_truth_instance) + + prediction_semantic_1 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 0, 0], + [0, 0, 0, 2, 2, 1, 1, 1], + [0, 2, 2, 2, 2, 2, 1, 1], + [2, 2, 2, 2, 2, 2, 2, 1], + [2, 2, 2, 2, 2, 2, 2, 1], + [2, 2, 2, 2, 2, 2, 2, 1]]) + prediction_semantic_2 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 1, 1, 0, 0], + [0, 2, 2, 2, 1, 1, 1, 1], + [2, 2, 2, 2, 1, 1, 1, 1], + [2, 2, 2, 2, 2, 1, 1, 1], + [2, 2, 2, 2, 2, 2, 1, 1], + [2, 2, 2, 2, 2, 1, 1, 1]]) + prediction_semantic_3 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 1, 0, 0, 0], + [0, 0, 1, 1, 1, 1, 0, 0], + [2, 2, 2, 1, 1, 1, 0, 0], + [2, 2, 2, 1, 1, 1, 1, 1], + [2, 2, 2, 2, 1, 1, 1, 1], + [2, 2, 2, 2, 1, 1, 1, 1]]) + prediction_semantic = np.stack( + [prediction_semantic_1, prediction_semantic_2, prediction_semantic_3]) + + prediction_instance_1 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 2, 2, 0, 0, 0], + [0, 2, 2, 2, 2, 1, 0, 0], + [2, 2, 2, 2, 2, 1, 1, 0], + [2, 2, 2, 2, 1, 1, 1, 0], + [2, 2, 2, 2, 1, 1, 1, 0]]) + prediction_instance_2 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 2, 2, 2, 0, 0, 0, 0], + [2, 2, 2, 2, 0, 0, 0, 0], + [2, 2, 2, 2, 2, 0, 0, 0], + [2, 2, 2, 2, 1, 1, 0, 0], + [2, 2, 2, 2, 1, 0, 0, 0]]) + prediction_instance_3 = np.array([[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0], + [2, 2, 2, 0, 0, 0, 0, 0], + [2, 2, 2, 0, 0, 0, 0, 0], + [2, 2, 2, 2, 0, 0, 0, 0], + [2, 2, 2, 2, 0, 0, 0, 0]]) + prediction_instance = np.stack( + [prediction_instance_1, prediction_instance_2, prediction_instance_3]) + prediction = (prediction_semantic * max_instances_per_category + + prediction_instance) + + # Compute STQuality. + stq_metric = stq.STQuality( + n_classes, things_list, ignore_label, max_instances_per_category, + 256 * 256) + + for i in range(3): + stq_metric.update_state( + tf.convert_to_tensor(ground_truth[i, ...], dtype=tf.int32), + tf.convert_to_tensor(prediction[i, ...], dtype=tf.int32), + 1) + + result = stq_metric.result() + + np.testing.assert_almost_equal(result['STQ'], 0.66841773352) + np.testing.assert_almost_equal(result['AQ'], 0.55366581415) + np.testing.assert_almost_equal(result['IoU'], 0.8069529580309542) + np.testing.assert_almost_equal(result['STQ_per_seq'], [0.66841773352]) + np.testing.assert_almost_equal(result['AQ_per_seq'], [0.55366581415]) + np.testing.assert_almost_equal(result['IoU_per_seq'], [0.8069529580309542]) + np.testing.assert_almost_equal(result['ID_per_seq'], [1]) + np.testing.assert_almost_equal(result['Length_per_seq'], [3]) + + def test_basic_examples(self): + n_classes = 2 + ignore_label = 255 + # classes = ['cars', 'sky']. + things_list = [0] + max_instances_per_category = 1000 + + # Since the semantic label is `0`, the instance ID is enough. 
+ ground_truth_track = np.array([[1, 1, 1, 1, 1]]) + + stq_metric = stq.STQuality( + n_classes, things_list, ignore_label, max_instances_per_category, + 256 * 256) + + with self.subTest('Example 0'): + predicted_track = np.array([[1, 1, 1, 1, 1]]) + _compute_metric_and_compare(stq_metric, ground_truth_track, + predicted_track, [1.0, 1.0, 1.0]) + + with self.subTest('Example 1'): + predicted_track = np.array([[1, 1, 2, 2, 2]]) + _compute_metric_and_compare(stq_metric, ground_truth_track, + predicted_track, [0.72111026, 0.52, 1.0]) + + with self.subTest('Example 2'): + predicted_track = np.array([[1, 2, 2, 2, 2]]) + _compute_metric_and_compare(stq_metric, ground_truth_track, + predicted_track, [0.82462113, 0.68, 1.0]) + + with self.subTest('Example 3'): + predicted_track = np.array([[1, 2, 3, 4, 5]]) + _compute_metric_and_compare(stq_metric, ground_truth_track, + predicted_track, [0.447213596, 0.2, 1.0]) + + with self.subTest('Example 4'): + predicted_track = np.array([[1, 2, 1, 2, 2]]) + _compute_metric_and_compare(stq_metric, ground_truth_track, + predicted_track, [0.72111026, 0.52, 1.0]) + + with self.subTest('Example 5'): + predicted_track = ( + np.array([[0, 1, 1, 1, 1]]) + + np.array([[1, 0, 0, 0, 0]]) * max_instances_per_category) + _compute_metric_and_compare(stq_metric, ground_truth_track, + predicted_track, [0.50596443, 0.64, 0.4]) + + # First label is `crowd`. + ground_truth_track = np.array([[0, 1, 1, 1, 1, 1]]) + + with self.subTest('Example 6'): + predicted_track = np.array([[1, 1, 1, 1, 1, 1]]) + _compute_metric_and_compare(stq_metric, ground_truth_track, + predicted_track, [1.0, 1.0, 1.0]) + + with self.subTest('Example 7'): + predicted_track = np.array([[2, 2, 2, 2, 1, 1]]) + _compute_metric_and_compare(stq_metric, ground_truth_track, + predicted_track, [0.72111026, 0.52, 1.0]) + + with self.subTest('Example 8'): + predicted_track = ( + np.array([[2, 2, 0, 1, 1, 1]]) + + np.array([[0, 0, 1, 0, 0, 0]]) * max_instances_per_category) + _compute_metric_and_compare(stq_metric, ground_truth_track, + predicted_track, + [0.40824829, 0.4, 5.0 / 12.0]) + + # First label is `sky`. + ground_truth_track = ( + np.array([[0, 1, 1, 1, 1]]) + + np.array([[1, 0, 0, 0, 0]]) * max_instances_per_category) + + with self.subTest('Example 9'): + predicted_track = np.array([[1, 1, 1, 1, 1]]) + _compute_metric_and_compare(stq_metric, ground_truth_track, + predicted_track, [0.56568542, 0.8, 0.4]) + + with self.subTest('Example 10'): + predicted_track = np.array([[2, 2, 2, 1, 1]]) + _compute_metric_and_compare(stq_metric, ground_truth_track, + predicted_track, + [0.42426407, 0.45, 0.4]) + + with self.subTest('Example 11'): + predicted_track = ( + np.array([[2, 2, 0, 1, 1]]) + + np.array([[0, 0, 1, 0, 0]]) * max_instances_per_category) + _compute_metric_and_compare(stq_metric, ground_truth_track, + predicted_track, + [0.3, 0.3, 0.3]) + + +if __name__ == '__main__': + tf.test.main() diff --git a/evaluation/test_utils.py b/evaluation/test_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0308eb618688c761a60b7a6bc07d0281dcbace70 --- /dev/null +++ b/evaluation/test_utils.py @@ -0,0 +1,121 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utility functions to set up unit tests on Panoptic Segmentation code.""" + +import os +from typing import Mapping, Optional, Tuple + +from absl import flags +import numpy as np +from PIL import Image + +import tensorflow as tf + +FLAGS = flags.FLAGS + +_TEST_DATA_DIR = ('deeplab2/' + 'evaluation/testdata') + + +def read_test_image(testdata_path: str, + image_format: Optional[str] = None) -> np.ndarray: + """Loads a test image. + + Args: + testdata_path: Image path relative to panoptic_segmentation/testdata as a + string. + image_format: Format of the image. Can be one of 'RGBA', 'RGB', or 'L'. + + Returns: + The image, as a numpy array. + """ + image_path = os.path.join(_TEST_DATA_DIR, testdata_path) + with tf.io.gfile.GFile(image_path, 'rb') as f: + image = Image.open(f) + if image_format is not None: + image = image.convert(image_format) + return np.array(image) + + +def read_segmentation_with_rgb_color_map( + image_testdata_path: str, + rgb_to_semantic_label: Mapping[Tuple[int, int, int], int], + output_dtype: Optional[np.dtype] = None) -> np.ndarray: + """Reads a test segmentation as an image and a map from colors to labels. + + Args: + image_testdata_path: Image path relative to panoptic_segmentation/testdata + as a string. + rgb_to_semantic_label: Mapping from RGB colors to integer labels as a + dictionary. + output_dtype: Type of the output labels. If None, defaults to the type of + the provided color map. + + Returns: + A 2D numpy array of labels. + + Raises: + ValueError: On an incomplete `rgb_to_semantic_label`. + """ + rgb_image = read_test_image(image_testdata_path, image_format='RGB') + if len(rgb_image.shape) != 3 or rgb_image.shape[2] != 3: + raise AssertionError('Expected RGB image, actual shape is %s' % + (rgb_image.shape,)) + + num_pixels = rgb_image.shape[0] * rgb_image.shape[1] + unique_colors = np.unique(np.reshape(rgb_image, [num_pixels, 3]), axis=0) + if not set(map(tuple, unique_colors)).issubset(rgb_to_semantic_label.keys()): + raise ValueError('RGB image has colors not in color map.') + + output_dtype = output_dtype or type( + next(iter(rgb_to_semantic_label.values()))) + output_labels = np.empty(rgb_image.shape[:2], dtype=output_dtype) + for rgb_color, int_label in rgb_to_semantic_label.items(): + color_array = np.array(rgb_color, ndmin=3) + output_labels[np.all(rgb_image == color_array, axis=2)] = int_label + return output_labels + + +def panoptic_segmentation_with_class_map( + instance_testdata_path: str, instance_label_to_semantic_label: Mapping[int, + int] +) -> Tuple[np.ndarray, np.ndarray]: + """Reads in a panoptic segmentation with an instance map and a map to classes. + + Args: + instance_testdata_path: Path to a grayscale instance map, given as a string + and relative to panoptic_segmentation/testdata. + instance_label_to_semantic_label: A map from instance labels to class + labels. + + Returns: + A tuple `(instance_labels, class_labels)` of numpy arrays. + + Raises: + ValueError: On a mismatched set of instances in + the + `instance_label_to_semantic_label`. 
+ """ + instance_labels = read_test_image(instance_testdata_path, image_format='L') + if set(np.unique(instance_labels)) != set( + instance_label_to_semantic_label.keys()): + raise ValueError('Provided class map does not match present instance ids.') + + class_labels = np.empty_like(instance_labels) + for instance_id, class_id in instance_label_to_semantic_label.items(): + class_labels[instance_labels == instance_id] = class_id + + return instance_labels, class_labels diff --git a/evaluation/test_utils_test.py b/evaluation/test_utils_test.py new file mode 100644 index 0000000000000000000000000000000000000000..0bdb32281b2c65dbfe1e7e875c59f7a5a13acb0f --- /dev/null +++ b/evaluation/test_utils_test.py @@ -0,0 +1,67 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for test_utils.""" +import numpy as np +import tensorflow as tf + +from deeplab2.evaluation import test_utils + + +class TestUtilsTest(tf.test.TestCase): + + def test_read_test_image(self): + image_array = test_utils.read_test_image('team_pred_class.png') + self.assertSequenceEqual(image_array.shape, (231, 345, 4)) + + def test_reads_segmentation_with_color_map(self): + rgb_to_semantic_label = {(0, 0, 0): 0, (0, 0, 255): 1, (255, 0, 0): 23} + labels = test_utils.read_segmentation_with_rgb_color_map( + 'team_pred_class.png', rgb_to_semantic_label) + + input_image = test_utils.read_test_image('team_pred_class.png') + np.testing.assert_array_equal( + labels == 0, + np.logical_and(input_image[:, :, 0] == 0, input_image[:, :, 2] == 0)) + np.testing.assert_array_equal(labels == 1, input_image[:, :, 2] == 255) + np.testing.assert_array_equal(labels == 23, input_image[:, :, 0] == 255) + + def test_reads_gt_segmentation(self): + instance_label_to_semantic_label = { + 0: 0, + 47: 1, + 97: 1, + 133: 1, + 150: 1, + 174: 1, + 198: 23, + 215: 1, + 244: 1, + 255: 1, + } + instances, classes = test_utils.panoptic_segmentation_with_class_map( + 'team_gt_instance.png', instance_label_to_semantic_label) + + expected_label_shape = (231, 345) + self.assertSequenceEqual(instances.shape, expected_label_shape) + self.assertSequenceEqual(classes.shape, expected_label_shape) + np.testing.assert_array_equal(instances == 0, classes == 0) + np.testing.assert_array_equal(instances == 198, classes == 23) + np.testing.assert_array_equal( + np.logical_and(instances != 0, instances != 198), classes == 1) + + +if __name__ == '__main__': + tf.test.main() diff --git a/evaluation/testdata/README.md b/evaluation/testdata/README.md new file mode 100644 index 0000000000000000000000000000000000000000..37927ec0faf0f1151758df5f0bd68bf7406f1b2e --- /dev/null +++ b/evaluation/testdata/README.md @@ -0,0 +1,11 @@ +# Segmentation Evalaution Test Data + +## Source Images + +* [team_input.png](team_input.png) \ + Source: + https://ai.googleblog.com/2018/03/semantic-image-segmentation-with.html +* [cat_input.jpg](cat_input.jpg) \ + Source: https://www.flickr.com/photos/magdalena_b/4995858743 +* 
[bird_input.jpg](bird_input.jpg) \ + Source: https://www.flickr.com/photos/chivinskia/40619099560 diff --git a/evaluation/testdata/bird_gt.png b/evaluation/testdata/bird_gt.png new file mode 100644 index 0000000000000000000000000000000000000000..05d854915d1809abe3ba10f03c20e75706e0bb17 Binary files /dev/null and b/evaluation/testdata/bird_gt.png differ diff --git a/evaluation/testdata/bird_pred_class.png b/evaluation/testdata/bird_pred_class.png new file mode 100644 index 0000000000000000000000000000000000000000..07351bf061115d0990486cbb086b6b9ec53e691b Binary files /dev/null and b/evaluation/testdata/bird_pred_class.png differ diff --git a/evaluation/testdata/bird_pred_instance.png b/evaluation/testdata/bird_pred_instance.png new file mode 100644 index 0000000000000000000000000000000000000000..faa1371f52510fb6f15fecb0eecc3441b2c8eadb Binary files /dev/null and b/evaluation/testdata/bird_pred_instance.png differ diff --git a/evaluation/testdata/cat_gt.png b/evaluation/testdata/cat_gt.png new file mode 100644 index 0000000000000000000000000000000000000000..41f60111f3de899a9e1ca3a646bea72d86b3009f Binary files /dev/null and b/evaluation/testdata/cat_gt.png differ diff --git a/evaluation/testdata/cat_pred_class.png b/evaluation/testdata/cat_pred_class.png new file mode 100644 index 0000000000000000000000000000000000000000..3728c68ced20312567e70540b667b53269000318 Binary files /dev/null and b/evaluation/testdata/cat_pred_class.png differ diff --git a/evaluation/testdata/cat_pred_instance.png b/evaluation/testdata/cat_pred_instance.png new file mode 100644 index 0000000000000000000000000000000000000000..ebd9ba4855f5c88a3b336d50e21d864a37175bbe Binary files /dev/null and b/evaluation/testdata/cat_pred_instance.png differ diff --git a/evaluation/testdata/team_gt_instance.png b/evaluation/testdata/team_gt_instance.png new file mode 100644 index 0000000000000000000000000000000000000000..97abb55273ce409a5fbaa85cb999f0725d457dbf Binary files /dev/null and b/evaluation/testdata/team_gt_instance.png differ diff --git a/evaluation/testdata/team_pred_class.png b/evaluation/testdata/team_pred_class.png new file mode 100644 index 0000000000000000000000000000000000000000..2ed78de2cbd923e6530f08fc2c47bf8377cfaf69 Binary files /dev/null and b/evaluation/testdata/team_pred_class.png differ diff --git a/evaluation/testdata/team_pred_instance.png b/evaluation/testdata/team_pred_instance.png new file mode 100644 index 0000000000000000000000000000000000000000..264606a4d8822108481132ff9e990d826c64a274 Binary files /dev/null and b/evaluation/testdata/team_pred_instance.png differ diff --git a/evaluation/video_panoptic_quality.py b/evaluation/video_panoptic_quality.py new file mode 100644 index 0000000000000000000000000000000000000000..02294e6ac56ac4c3a704445e266d874eedf1cf57 --- /dev/null +++ b/evaluation/video_panoptic_quality.py @@ -0,0 +1,98 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Implementation of the Video Panoptic Quality metric. 
+
+Video Panoptic Quality is an instance-based metric for evaluating the task
+of video panoptic segmentation.
+Please see the paper for details:
+Dahun Kim, Sanghyun Woo, Joon-Young Lee, and In So Kweon.
+"Video panoptic segmentation." In CVPR, 2020.
+"""
+
+from typing import List, Tuple
+
+import numpy as np
+import tensorflow as tf
+from deeplab2.evaluation import panoptic_quality
+
+
+class VideoPanopticQuality(panoptic_quality.PanopticQuality):
+  """Metric class for Video Panoptic Quality.
+
+  Dahun Kim, Sanghyun Woo, Joon-Young Lee, and In So Kweon.
+  "Video panoptic segmentation." In CVPR, 2020.
+
+  Video Panoptic Quality can be modeled as Image Panoptic Quality, where the
+  sequence of predictions is horizontally concatenated into one image, and the
+  sequence of ground-truth labels into another. Therefore, this class inherits
+  the image panoptic quality class and changes the implementation to compare
+  the concatenated label maps.
+
+  Siyuan Qiao, Yukun Zhu, Hartwig Adam, Alan Yuille, and Liang-Chieh Chen.
+  "ViP-DeepLab: Learning Visual Perception with Depth-aware Video Panoptic
+  Segmentation." In CVPR, 2021.
+
+  Stand-alone usage:
+    vpq_obj = video_panoptic_quality.VideoPanopticQuality(
+      num_classes, max_instances_per_category, ignored_label)
+    vpq_obj.update_state(y_true_1, y_pred_1)
+    vpq_obj.update_state(y_true_2, y_pred_2)
+    ...
+    result = vpq_obj.result().numpy()
+  """
+
+  def __init__(self,
+               num_classes: int,
+               ignored_label: int,
+               max_instances_per_category: int,
+               offset: int,
+               name: str = 'video_panoptic_quality',
+               **kwargs):
+    """Initialization of the VideoPanopticQuality metric.
+
+    Args:
+      num_classes: Number of classes in the dataset as an integer.
+      ignored_label: The class id to be ignored in evaluation as an integer or
+        integer tensor.
+      max_instances_per_category: The maximum number of instances for each class
+        as an integer or integer tensor.
+      offset: The maximum number of unique labels as an integer or integer
+        tensor.
+      name: An optional variable_scope name. (default: 'video_panoptic_quality')
+      **kwargs: The keyword arguments that are passed on to `fn`.
+    """
+    super().__init__(num_classes, ignored_label, max_instances_per_category,
+                     offset, name, **kwargs)
+
+  def compare_and_accumulate(
+      self, gt_panoptic_labels: List[tf.Tensor],
+      pred_panoptic_labels: List[tf.Tensor]
+  ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Compares predicted segmentation with ground truth and accumulates metrics.
+
+    Args:
+      gt_panoptic_labels: A list of tensors for the ground-truth
+        video panoptic segmentation labels.
+      pred_panoptic_labels: A list of tensors for video panoptic
+        segmentation predictions.
+
+    Returns:
+      A tuple of the accumulated metrics (iou, tp, fn, fp) over all
+      comparisons.
+    """
+    gt_panoptic_label = tf.concat(gt_panoptic_labels, axis=1)
+    pred_panoptic_label = tf.concat(pred_panoptic_labels, axis=1)
+    return super(VideoPanopticQuality, self).compare_and_accumulate(
+        gt_panoptic_label, pred_panoptic_label)
diff --git a/evaluator.proto b/evaluator.proto
new file mode 100644
index 0000000000000000000000000000000000000000..e0d72a2ba1b692caea9109f162af5d994c4303f8
--- /dev/null
+++ b/evaluator.proto
@@ -0,0 +1,95 @@
+// Copyright 2021 The Deeplab2 Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package deeplab2; + +// Next ID: 22 +message EvaluatorOptions { + // Set the number of steps to run evaluation. -1 corresponds to a run over the + // full dataset. + optional int32 eval_steps = 1 [default = -1]; + // Set the number of train steps after which eval should run in interleaved + // mode. + optional int32 eval_interval = 2 [default = 5000]; + // Set the number of seconds to wait at most for the next checkpoint. -1 means + // the job will wait forever. + optional int32 continuous_eval_timeout = 3 [default = -1]; + // Set whether to run evaluation as a tf function. + optional bool use_tf_function = 4 [default = true]; + // Set the area size of stuff segments to discard. + optional int32 stuff_area_limit = 6 [default = 0]; + // Set the area size of thing segments to discard (set to ignore_label). Note + // that this option is currently only supported in MaX-DeepLab. + optional int32 thing_area_limit = 19 [default = 0]; + // Set the threshold for the transformer class confidence. + optional float transformer_class_confidence_threshold = 20 [default = 0.7]; + // Set the threshold for the per-pixel mask confidence. Note that this option + // is currently only supported in MaX-DeepLab. + optional float pixel_confidence_threshold = 21 [default = 0.4]; + // Set the threshold of the center heatmap for post-processing. + optional float center_score_threshold = 7 [default = 0.1]; + // Set the kernel size of the nms kernel for the center heatmap. + optional int32 nms_kernel = 8 [default = 3]; + // Set the number of top centers to keep. -1 corresponds to keeping all + // centers. + optional int32 keep_k_centers = 9 [default = 400]; + // Enable saving predictions to disk. + optional bool save_predictions = 10 [default = false]; + // Override the storing location. By default, predictions are written to + // `experiment_root` + `experiment_name` + `vis`. + optional string override_save_dir = 11; + // Set the number of samples to visualize. + optional int32 num_vis_samples = 12 [default = 10]; + // Enable saving raw predictions for the whole dataset. The output path is the + // save_dir + `raw_semantic`/`raw_panoptic`. + optional bool save_raw_predictions = 13 [default = false]; + // The format of raw panoptic predictions. This flag is used together with + // `save_raw_predictions`. When save_raw_predictions is True, this field + // specifies the format of saved raw panoptic predictions. Supports: + // - 'two_channel_png': The popular format, also supported by the official + // COCO panoptic API (https://github.com/cocodataset/panopticapi), where + // the saved PNG image contains R-channel for semantic labels and + // G-channel for instance IDs. + // - 'three_channel_png': A simple extension of the 'two_channel_png' format, + // and is adopted in some video panoptic segmentation datasets (for + // example, KITTI-STEP and MOTChallenge-STEP), where the saved PNG image + // contains R-channel for semantic labels, G-channel for the values of + // (instance ID // 256), and B-channel for (instance ID % 256). 
+  //  - 'two_channel_numpy_array': A more flexible format (unconstrained by the
+  //    PNG channel size), where the panoptic predictions are saved as a numpy
+  //    array in the two channel format (i.e., first channel encodes the
+  //    semantic class and the second channel the instance ID).
+  optional string raw_panoptic_format = 17 [default = 'two_channel_png'];
+  // Enable conversion of train IDs to eval IDs for raw predictions.
+  optional bool convert_raw_to_eval_ids = 14 [default = true];
+  // Add flipped images for evaluation or not. This is used for multi-scale
+  // inference (usually used together with `eval_scales`). If True, another
+  // flipped image will be used during inference.
+  optional bool add_flipped_images = 5 [default = false];
+  // The scales to resize images for inference. Change it to, e.g. [0.5, 0.75,
+  // 1.0, 1.25, 1.5, 1.75], for multi-scale inference.
+  repeated float eval_scales = 15 [packed = true];
+  // Boolean, if true, use TensorFlow operation (CUDA kernel) to merge
+  // semantic and instance segmentation (for the final panoptic segmentation).
+  // Defaults to true, as our GPU implementation is much faster. Set to false
+  // if you could not successfully compile TensorFlow with this operation.
+  optional bool merge_semantic_and_instance_with_tf_op = 16 [default = true];
+  // Displays detailed metrics on instance segmentation AP. This includes e.g.
+  // AP at a matching IoU threshold of 0.5, or the AP of small objects only,
+  // etc. If false, will only display a summary AP metric that is an average
+  // over IoU thresholds and all objects.
+  optional bool detailed_ap_metrics = 18 [default = false];
+}
diff --git a/export_model.py b/export_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..176721bfe4d370686ed45d2a658c7948f75c64f0
--- /dev/null
+++ b/export_model.py
@@ -0,0 +1,157 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""Script to export a DeepLab model to a saved model."""
+
+import functools
+from typing import Any, MutableMapping, Sequence, Text
+
+from absl import app
+from absl import flags
+import tensorflow as tf
+
+from google.protobuf import text_format
+from deeplab2 import config_pb2
+from deeplab2.data import dataset
+from deeplab2.data.preprocessing import input_preprocessing
+from deeplab2.model import utils
+from deeplab2.trainer import train_lib
+
+
+_FLAGS_EXPERIMENT_OPTION_PATH = flags.DEFINE_string(
+    'experiment_option_path',
+    default='',
+    help='Path to the experiment option text proto.')
+
+_FLAGS_CKPT_PATH = flags.DEFINE_string(
+    'checkpoint_path',
+    default='',
+    help='Path to the saved checkpoint.')
+
+_FLAGS_OUTPUT_PATH = flags.DEFINE_string(
+    'output_path',
+    default='',
+    help='Output directory path for the exported saved model.')
+
+_FLAGS_MERGE_WITH_TF_OP = flags.DEFINE_boolean(
+    'merge_with_tf_op',
+    default=False,
+    help='Whether to use the customized TF op for merging semantic and '
+    'instance predictions. Set it to True to reproduce the numbers as '
+    'reported in the paper, but the saved model would require a specifically '
+    'compiled TensorFlow to run.')
+
+
+class DeepLabModule(tf.Module):
+  """Class that runs DeepLab inference end-to-end."""
+
+  def __init__(self, config: config_pb2.ExperimentOptions, ckpt_path: Text,
+               use_tf_op: bool = False):
+    super().__init__(name='DeepLabModule')
+
+    dataset_options = config.eval_dataset_options
+    dataset_name = dataset_options.dataset
+    crop_height, crop_width = dataset_options.crop_size
+
+    config.evaluator_options.merge_semantic_and_instance_with_tf_op = use_tf_op
+    # Disable drop path and recompute grad as they are only used in training.
+    config.model_options.backbone.drop_path_keep_prob = 1.0
+
+    deeplab_model = train_lib.create_deeplab_model(
+        config,
+        dataset.MAP_NAME_TO_DATASET_INFO[dataset_name])
+    self._is_motion_deeplab = (
+        config.model_options.WhichOneof('meta_architecture') ==
+        'motion_deeplab')
+
+    # For now, we only support a batch size of 1 for the saved model.
+    input_shape = train_lib.build_deeplab_model(
+        deeplab_model, (crop_height, crop_width), batch_size=1)
+    self._input_depth = input_shape[-1]
+
+    checkpoint = tf.train.Checkpoint(**deeplab_model.checkpoint_items)
+    # Not all saved variables (e.g. variables from the optimizer) will be
+    # restored. Call `expect_partial()` to suppress the warning.
+    checkpoint.restore(ckpt_path).expect_partial()
+    self._model = deeplab_model
+
+    self._preprocess_fn = functools.partial(
+        input_preprocessing.preprocess_image_and_label,
+        label=None,
+        crop_height=crop_height,
+        crop_width=crop_width,
+        prev_label=None,
+        min_resize_value=dataset_options.min_resize_value,
+        max_resize_value=dataset_options.max_resize_value,
+        resize_factor=dataset_options.resize_factor,
+        is_training=False)
+
+  def get_input_spec(self):
+    """Returns the TensorSpec of the input tensor needed for inference."""
+    # We expect a single 3D, uint8 tensor with shape [height, width, channels].
+    return tf.TensorSpec(shape=[None, None, self._input_depth], dtype=tf.uint8)
+
+  @tf.function
+  def __call__(self, input_tensor: tf.Tensor) -> MutableMapping[Text, Any]:
+    """Performs a forward pass.
+
+    Args:
+      input_tensor: A uint8 input tensor of type tf.Tensor with shape [height,
+        width, channels].
+
+    Returns:
+      A dictionary containing the results of the specified DeepLab architecture.
+      The results are bilinearly upsampled to input size before returning.
+    """
+    input_size = [tf.shape(input_tensor)[0], tf.shape(input_tensor)[1]]
+
+    if self._is_motion_deeplab:
+      # For motion deeplab, split the input tensor into the current and
+      # previous frames before preprocessing, and re-assemble them afterwards.
+      image, prev_image = tf.split(input_tensor, 2, axis=2)
+      (resized_image, processed_image, _, processed_prev_image,
+       _) = self._preprocess_fn(image=image, prev_image=prev_image)
+      processed_image = tf.concat(
+          [processed_image, processed_prev_image], axis=2)
+    else:
+      (resized_image, processed_image, _, _, _) = self._preprocess_fn(
+          image=input_tensor)
+
+    resized_size = tf.shape(resized_image)[0:2]
+    # Make the input tensor 4D to fit the model input requirements.
+    outputs = self._model(tf.expand_dims(processed_image, 0), training=False)
+    # We only undo-preprocess for those defined in tuples in model/utils.py.
+    return utils.undo_preprocessing(outputs, resized_size,
+                                    input_size)
+
+
+def main(argv: Sequence[str]) -> None:
+  if len(argv) > 1:
+    raise app.UsageError('Too many command-line arguments.')
+
+  config = config_pb2.ExperimentOptions()
+  with tf.io.gfile.GFile(_FLAGS_EXPERIMENT_OPTION_PATH.value, 'r') as f:
+    text_format.Parse(f.read(), config)
+
+  module = DeepLabModule(
+      config, _FLAGS_CKPT_PATH.value, _FLAGS_MERGE_WITH_TF_OP.value)
+
+  signatures = module.__call__.get_concrete_function(module.get_input_spec())
+  tf.saved_model.save(
+      module, _FLAGS_OUTPUT_PATH.value, signatures=signatures)
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/g3doc/change_logs.md b/g3doc/change_logs.md
new file mode 100644
index 0000000000000000000000000000000000000000..339995cd2c82674d225d595e090c19206f73092a
--- /dev/null
+++ b/g3doc/change_logs.md
@@ -0,0 +1,6 @@
+# Change logs
+
+* June 7th, 2021: Add Hungarian matching support on TPU for MaX-DeepLab. Our
+  TF2 version is based on Jiquan Ngiam's original Lingvo TensorFlow
+  implementation and Amil Merchant's TF1 version modifications.
+* June 1st, 2021: "Hello, World!", DeepLab2 made publicly available.
diff --git a/g3doc/faq.md b/g3doc/faq.md
new file mode 100644
index 0000000000000000000000000000000000000000..bd9d72274bd08525e197649ccdd412a688128bb6
--- /dev/null
+++ b/g3doc/faq.md
@@ -0,0 +1,98 @@
+# FAQ
+
+________________________________________________________________________________
+
+**Q1: What should I do if I encounter OOM (out-of-memory) while training the
+models?**
+
+**A1**: To avoid OOM, you could try:
+
+1. reducing the training crop size (i.e., the flag `crop_size` in
+   `train_dataset_options`, and see Q2 for more details), which reduces the
+   input size during training,
+
+2. using a larger output stride (e.g., 32) in the backbone (i.e., the flag
+   `output_stride` in `model_options`, and see Q3 for more details), which
+   reduces the usage of atrous convolution,
+
+3. using a smaller backbone, such as ResNet-50.
+
+________________________________________________________________________________
+
+**Q2: What `crop_size` do I need to set?**
+
+**A2**: The DeepLab framework always uses `crop_size` equal to `output_stride` *
+k + 1, where k is an integer.
+
+* During inference/evaluation, since the DeepLab framework uses whole-image
+  inference, we need to set k so that the resulting `crop_size` (in
+  `eval_dataset_options`) is slightly larger than the largest image dimension
+  in the dataset. For example, we set eval_crop_size = 1025x2049 for Cityscapes
+  images, whose image dimension is always equal to 1024x2048.
+
+* During training, we could set k to be any integer as long as it fits in your
+  device memory. However, we notice a better performance when we have the same
+  `crop_size` during training and evaluation (i.e., also use the whole-image
+  crop size during training).
+
+________________________________________________________________________________
+
+**Q3: What output stride should I use in the backbone?**
+
+**A3**: Using a different output stride leads to a different accuracy-and-memory
+trade-off. For example, DeepLabv1 uses output stride = 8, but it requires a lot
+of device memory. In the DeepLabv3+ paper, we found that using output stride =
+16 strikes the best accuracy-and-memory trade-off, which is therefore our
+default setting. If you wish to further reduce the memory usage, you could set
+the output stride to 32. Additionally, we suggest adjusting the `atrous_rates`
+in the ASPP module as follows.
+
+* If `backbone.output_stride` = 32, use `atrous_rates` = [3, 6, 9].
+
+* If `backbone.output_stride` = 16, use `atrous_rates` = [6, 12, 18].
+
+* If `backbone.output_stride` = 8, use `atrous_rates` = [12, 24, 36].
+
+Note that these settings may not be optimal. You may need to adjust them to
+better fit your dataset.
+
+________________________________________________________________________________
+
+**Q4: Why are the results reported by the provided evaluation code slightly
+different from the official evaluation code (e.g.,
+[Cityscapes](https://github.com/mcordts/cityscapesScripts))?**
+
+**A4**: In order to run everything end-to-end in the TensorFlow system (e.g.,
+the on-line evaluation during training), we re-implemented the evaluation codes
+in TensorFlow. Additionally, our whole system, including the training and
+evaluation pipelines, uses the panoptic label format (i.e., `panoptic_label =
+semantic_label * label_divisor + instance_id`, where the `label_divisor` should
+be larger than the maximum number of instances per image), instead of the JSON
+[COCO formats](https://cocodataset.org/#format-data). These two changes, along
+with rounding and similar issues, result in some minor differences. Therefore,
+our re-implemented evaluation code is mainly used for TensorFlow integration
+(e.g., the support of on-line evaluation in TensorBoard). Users should run the
+corresponding official evaluation code in order to compare with other published
+papers. Note that all the reported numbers in our papers are evaluated with the
+official evaluation code.
+
+To facilitate the conversion between prediction formats, we also provide
+instructions for running the official evaluation codes on
+[Cityscapes](setup/cityscapes_test_server_evaluation.md) and
+[COCO](setup/coco_test_server_evaluation.md).
+
+________________________________________________________________________________
+
+**Q5: What should I do if I cannot compile TensorFlow with the provided
+efficient merging operation `merge_semantic_and_instance_maps`?**
+
+**A5**: In this case, we provide a fallback solution that implements the
+merging operation with pure tf functions. This fallback solution does not
+require any TensorFlow compilation. However, note that compared to our provided
+TensorFlow merging operation `merge_semantic_and_instance_maps`, its inference
+speed is slower and the resulting segmentation performance may also be slightly
+lower.
+
+To use the pure-tf-function version of `merge_semantic_and_instance_maps`, set
+`merge_semantic_and_instance_with_tf_op` to `false` in your config's
+`evaluator_options`.
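+
+________________________________________________________________________________
+
+To make the panoptic label format in Q4 concrete, here is a minimal NumPy
+sketch (an illustration, not part of the library) that encodes and decodes a
+panoptic label map. The `label_divisor` of 1000 is only an example value; it
+must exceed the maximum number of instances per image.
+
+```python
+import numpy as np
+
+label_divisor = 1000  # Example value; must exceed max instances per image.
+semantic_label = np.array([[2, 2], [1, 1]])  # Per-pixel semantic class ids.
+instance_id = np.array([[1, 2], [0, 0]])     # 0 for regions without instances.
+
+# Encode: a single integer per pixel carries both the class and the instance.
+panoptic_label = semantic_label * label_divisor + instance_id
+
+# Decode the two component maps back out.
+assert (panoptic_label // label_divisor == semantic_label).all()
+assert (panoptic_label % label_divisor == instance_id).all()
+```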
diff --git a/g3doc/img/axial_deeplab/axial_block.png b/g3doc/img/axial_deeplab/axial_block.png
new file mode 100644
index 0000000000000000000000000000000000000000..1126fa1df6ecafd6f894de426093af588854d7b3
Binary files /dev/null and b/g3doc/img/axial_deeplab/axial_block.png differ
diff --git a/g3doc/img/axial_deeplab/nonlocal_block.png b/g3doc/img/axial_deeplab/nonlocal_block.png
new file mode 100644
index 0000000000000000000000000000000000000000..d0fd31722fe452e1463e5bbe426544c142877b5e
Binary files /dev/null and b/g3doc/img/axial_deeplab/nonlocal_block.png differ
diff --git a/g3doc/img/axial_deeplab/position_sensitive_axial_block.png b/g3doc/img/axial_deeplab/position_sensitive_axial_block.png
new file mode 100644
index 0000000000000000000000000000000000000000..812f33f192e857ebb6be2aed84adddeec7578cf5
Binary files /dev/null and b/g3doc/img/axial_deeplab/position_sensitive_axial_block.png differ
diff --git a/g3doc/img/max_deeplab/overview.png b/g3doc/img/max_deeplab/overview.png
new file mode 100644
index 0000000000000000000000000000000000000000..cee68f7fb42ca1800975e9901221fc2c933f3d2f
Binary files /dev/null and b/g3doc/img/max_deeplab/overview.png differ
diff --git a/g3doc/img/max_deeplab/overview_simple.png b/g3doc/img/max_deeplab/overview_simple.png
new file mode 100644
index 0000000000000000000000000000000000000000..c5693bfca5eb3fb08f1018a4a3e150e7daa28d19
Binary files /dev/null and b/g3doc/img/max_deeplab/overview_simple.png differ
diff --git a/g3doc/img/panoptic_deeplab.png b/g3doc/img/panoptic_deeplab.png
new file mode 100644
index 0000000000000000000000000000000000000000..31194fd257ae14b575a96556ad80af5e2e96593c
Binary files /dev/null and b/g3doc/img/panoptic_deeplab.png differ
diff --git a/g3doc/img/step/kitti_step_annotation.png b/g3doc/img/step/kitti_step_annotation.png
new file mode 100644
index 0000000000000000000000000000000000000000..4793a78ea703eb81b257aadcf899b53641ee4f96
Binary files /dev/null and b/g3doc/img/step/kitti_step_annotation.png differ
diff --git a/g3doc/projects/axial_deeplab.md b/g3doc/projects/axial_deeplab.md
new file mode 100644
index 0000000000000000000000000000000000000000..c99a064659501e57e6b2e595236ea7299c73c9d5
--- /dev/null
+++ b/g3doc/projects/axial_deeplab.md
@@ -0,0 +1,168 @@
+# Axial-DeepLab
+
+Axial-DeepLab, improving over Panoptic-DeepLab, incorporates the powerful
+axial self-attention modules [1], also known as the encoder of Axial
+Transformers [2], for general dense prediction tasks. In this document,
+we demonstrate the effectiveness of Axial-DeepLab on the task of panoptic
+segmentation [6], unifying semantic segmentation and instance segmentation.
+
+To reduce the computational complexity of 2D self-attention (especially
+prominent for dense pixel prediction tasks), and further to allow us to
+perform attention within a larger or even global region, we factorize the 2D
+self-attention [1, 3, 4] into **two** 1D self-attentions [2, 5]. We then
+effectively integrate the **axial-attention** into a residual block [7], as
+illustrated in Fig. 1.
+
+*[Figure 1. An axial-attention (residual) block, which consists of two
+axial-attention layers operating along height- and width-axis
+sequentially.]*
+
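+To make the factorization concrete, the following is a minimal NumPy sketch of
+the idea (not the library's implementation): plain single-head dot-product
+self-attention applied along the height axis and then the width axis, omitting
+the learned query/key/value projections and the position-sensitive terms used
+in the paper.
+
+```python
+import numpy as np
+
+
+def _attend_1d(x):
+  # x: [batch, length, channels]; plain dot-product self-attention.
+  scores = x @ x.transpose(0, 2, 1) / np.sqrt(x.shape[-1])
+  weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
+  weights /= weights.sum(axis=-1, keepdims=True)
+  return weights @ x
+
+
+def axial_attention(x):
+  # x: [height, width, channels].
+  # Height axis: every column is attended to as an independent 1D sequence.
+  x = _attend_1d(x.transpose(1, 0, 2)).transpose(1, 0, 2)
+  # Width axis: every row is attended to as an independent 1D sequence.
+  return _attend_1d(x)
+
+
+features = np.random.rand(4, 5, 8)
+print(axial_attention(features).shape)  # (4, 5, 8)
+```
+
+Two 1D attentions over an H x W map cost O(HW(H + W)) rather than the
+O((HW)^2) of full 2D self-attention, which is what makes larger or even global
+attention regions affordable.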
" + +gr.Interface( + inference, + [gr.inputs.Image(type="pil", label="Input")], + gr.outputs.Image(type="plot", label="Output"), + title=title, + description=description, + article=article, + examples=[ + ["city1.jpg"], + ["city2.jpg"] + ]).launch() diff --git a/model.proto b/model.proto new file mode 100644 index 0000000000000000000000000000000000000000..c4dd1a8afb9cb0c6b73803dd43a0c583b45e9997 --- /dev/null +++ b/model.proto @@ -0,0 +1,198 @@ +// Copyright 2021 The Deeplab2 Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package deeplab2; + +option java_multiple_files = true; + +/********** Submessages used to config model options **********/ +// Configure the decoder model options. +message DecoderOptions { + // Set the features key for the high-level features, e.g. 'res5'. + optional string feature_key = 1; + // Set the number of filters in each convolution of the decoder. + optional int32 decoder_channels = 2 [default = 256]; + // Set the decoder convolution type. Support 'depthwise_separable_conv' and + // 'standard_conv'. + optional string decoder_conv_type = 5 [default = 'depthwise_separable_conv']; + // Set the number of filters in each convolution of the ASPP. + optional int32 aspp_channels = 3 [default = 256]; + // Set the list of atrous rates used in the ASPP. Note that this field has + // to be of length 3 (to specify the three 3x3 atrous convolutions in ASPP), + // and it is effective only when `aspp_use_only_1x1_proj_conv` is false. + repeated int32 atrous_rates = 4; + // The ASPP module uses only 1x1 projection convolution (i.e., the ASPP five + // branches consisting of one 1x1 convolution, three 3x3 atrous convolutions + // with specified `atrous_rates`, and the global average pooling are turned + // off, when `aspp_use_only_1x1_proj_conv` is true), equivalent to applying + // only one 1x1 convolution to reduce the feature map channels (obtained from + // encoder backbone) to the specified `aspp_channels`. This field is mainly + // used (i.e., set to true) when the encoder backbone is already able to + // efficiently capture long-range information, e.g., by axial attention blocks + // (for reference, see configs/cityscapes/axial_deeplab). + optional bool aspp_use_only_1x1_proj_conv = 6 [default = false]; +} + +// Configure the low level features to use. +message LowLevelOptions { + // Set the name of the low-level feature, e.g. 'res2'. + optional string feature_key = 1; + // Set the number of filters for the 1x1 projection convolution. + optional int32 channels_project = 2; +} + +// Configure the head options. +message HeadOptions { + // Set the number of filters in the last convolution, e.g. 1 or NUM_CLASSES. + optional int32 output_channels = 1; + // Set the number of filters in the 5x5 convolution, e.g. 256 or 32. + optional int32 head_channels = 2; + // Set the head convolution type. 
Support 'depthwise_separable_conv' and + // 'standard_conv' + optional string head_conv_type = 3 [default = 'depthwise_separable_conv']; +} + +// Configure the instance branch. +message InstanceOptions { + // Set whether to use the instance branch. + optional bool enable = 1 [default = true]; + + // Set the low level options used in instance branch. The list of + // LowLevelOptions must be ordered lower resolution to higher resolution. + // Leaving it empty will use the same low level options as the semantic + // branch. + repeated LowLevelOptions low_level_override = 2; + // Set the decoder options of the instance branch. Leaving it empty will use + // the same decoder options as the semantic branch. + optional DecoderOptions instance_decoder_override = 3; + + // Configure instance center head. + optional HeadOptions center_head = 4; + // Configure instance regression head. + optional HeadOptions regression_head = 5; + + // Configure next-frame instance regression head. + optional HeadOptions next_regression_head = 6; +} + +// Configure the model options. +// Next ID: 12 +message ModelOptions { + // Configure model backbone. + message BackboneOptions { + // Set the name of the specific architecture of the family. + optional string name = 1 [default = 'resnet50']; + // Set the output stride of the encoder. + optional int32 output_stride = 2 [default = 32]; + // Set path to pretrained weights to load pretrained weights. + optional string pretrained_weights = 3; + // Set whether to use the squeeze-and-excite operation. + optional bool use_squeeze_and_excite = 4 [default = false]; + // Set the drop path keep probability for training. Default not to use. + optional float drop_path_keep_prob = 5 [default = 1.0]; + // Set the drop path schedule. Currently support (1) 'constant': use the + // same drop path probability for all blocks, and (2) 'linear': linearly + // decrease the drop path probability from 1.0 at the 0-th stage (or STEM) + // to drop_path_keep_prob at the last block. + optional string drop_path_schedule = 6 [default = 'constant']; + // Set the STEM width_multiplier, controlloing STEM convolution channels. + optional float stem_width_multiplier = 7 [default = 1.0]; + // Set the backbone (except STEM) width_multiplier, controlling backbone + // (except STEM) convolution channels. + optional float backbone_width_multiplier = 8 [default = 1.0]; + // Set the backbone (except STEM) layer_multiplier, controlling the number + // of layers in the backbone (except STEM). + optional float backbone_layer_multiplier = 9 [default = 1.0]; + // Use the Switchable Atrous Convolution (SAC) beyond the specified stride. + // For example, if use_sac_beyond_stride = 16, SAC will be applied to the + // network stage whose original output stride >= 16 (i.e., 16 and 32, or + // the last two stages). Set to -1 to disable it. + optional int32 use_sac_beyond_stride = 10 [default = -1]; + } + // Set the model option for the backbone encoder model. + optional BackboneOptions backbone = 1; + + // Shared decoder settings across different meta architectures. + optional DecoderOptions decoder = 2; + + // Meta-architecture specific settings. + message DeeplabV3Options { + // Set the number of classes for the last convolution to predict logits. + optional int32 num_classes = 1; + } + + message DeeplabV3PlusOptions { + // Set the low level options used in this decoder. The list of + // LowLevelOptions must be ordered from higher to lower levels. 
+ optional LowLevelOptions low_level = 1; + + // Set the number of classes for the last convolution to predict logits. + optional int32 num_classes = 2; + } + + message PanopticDeeplabOptions { + // Set the low level options used in this decoder. The list of + // LowLevelOptions must be ordered lower resolution to higher resolution. + repeated LowLevelOptions low_level = 1; + // Set the model options for the instance branch. + optional InstanceOptions instance = 2; + // Set the model options of the semantic head. + optional HeadOptions semantic_head = 3; + } + + message MotionDeepLabOptions { + // Set the low level options used in this decoder. The list of + // LowLevelOptions must be ordered lower resolution to higher resolution. + repeated LowLevelOptions low_level = 1; + // Set the model options for the instance branch. + optional InstanceOptions instance = 2; + // Set the model options of the semantic head. + optional HeadOptions semantic_head = 3; + // Set the model options for the motion head. + optional HeadOptions motion_head = 4; + } + + message MaXDeepLabOptions { + // Set the head options of the mask head. + optional HeadOptions pixel_space_head = 1; + // Set the low level options used in the semantic decoder. The list of + // LowLevelOptions must be ordered lower resolution to higher resolution. + repeated LowLevelOptions auxiliary_low_level = 2; + // Set the head options of the semantic head. + optional HeadOptions auxiliary_semantic_head = 3; + } + + oneof meta_architecture { + DeeplabV3Options deeplab_v3 = 3; + DeeplabV3PlusOptions deeplab_v3_plus = 4; + PanopticDeeplabOptions panoptic_deeplab = 5; + MotionDeepLabOptions motion_deeplab = 7; + MaXDeepLabOptions max_deeplab = 10; + PanopticDeeplabOptions vip_deeplab = 11; + } + // Set the checkpoint to load. + optional string initial_checkpoint = 6; + // Set whether to restore the last convolution of the semantic head when + // loading from the initial checkpoint. Setting this flag to false is useful + // when an initial checkpoint was trained on a dataset with different classes. + optional bool restore_semantic_last_layer_from_initial_checkpoint = 8 + [default = true]; + // Set whether to restore the last convolution of the instance heads when + // loading from the initial checkpoint. Depending on the meta architecture, + // this includes center heatmap, center regression and motion regression. + optional bool restore_instance_last_layer_from_initial_checkpoint = 9 + [default = true]; +} diff --git a/model/__init__.py b/model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..35e4ce02ff422f3aa84ab644b88d65b13e0cbc03 --- /dev/null +++ b/model/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ diff --git a/model/builder.py b/model/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..9983b3e7f38597a384aa99e9ab9a32158c3eef46 --- /dev/null +++ b/model/builder.py @@ -0,0 +1,174 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This file contains functions to build encoder and decoder.""" +import tensorflow as tf + +from deeplab2 import config_pb2 +from deeplab2.model.decoder import deeplabv3 +from deeplab2.model.decoder import deeplabv3plus +from deeplab2.model.decoder import max_deeplab +from deeplab2.model.decoder import motion_deeplab_decoder +from deeplab2.model.decoder import panoptic_deeplab +from deeplab2.model.decoder import vip_deeplab_decoder +from deeplab2.model.encoder import axial_resnet_instances +from deeplab2.model.encoder import mobilenet + + +def create_encoder(backbone_options: config_pb2.ModelOptions.BackboneOptions, + bn_layer: tf.keras.layers.Layer, + conv_kernel_weight_decay: float = 0.0) -> tf.keras.Model: + """Creates an encoder. + + Args: + backbone_options: A proto config of type + config_pb2.ModelOptions.BackboneOptions. + bn_layer: A tf.keras.layers.Layer that computes the normalization. + conv_kernel_weight_decay: A float, the weight decay for convolution kernels. + + Returns: + An instance of tf.keras.Model containing the encoder. + + Raises: + ValueError: An error occurs when the specified encoder meta architecture is + not supported. + """ + if ('resnet' in backbone_options.name or + 'swidernet' in backbone_options.name or + 'axial_deeplab' in backbone_options.name or + 'max_deeplab' in backbone_options.name): + return create_resnet_encoder( + backbone_options, + bn_layer=bn_layer, + conv_kernel_weight_decay=conv_kernel_weight_decay) + elif 'mobilenet' in backbone_options.name: + return create_mobilenet_encoder( + backbone_options, + bn_layer=bn_layer, + conv_kernel_weight_decay=conv_kernel_weight_decay) + raise ValueError('The specified encoder %s is not a valid encoder.' % + backbone_options.name) + + +def create_mobilenet_encoder( + backbone_options: config_pb2.ModelOptions.BackboneOptions, + bn_layer: tf.keras.layers.Layer, + conv_kernel_weight_decay: float = 0.0) -> tf.keras.Model: + """Creates a MobileNet encoder specified by name. + + Args: + backbone_options: A proto config of type + config_pb2.ModelOptions.BackboneOptions. + bn_layer: A tf.keras.layers.Layer that computes the normalization. + conv_kernel_weight_decay: A float, the weight decay for convolution kernels. + + Returns: + An instance of tf.keras.Model containing the MobileNet encoder. + """ + if backbone_options.name.lower() == 'mobilenet_v3_large': + backbone = mobilenet.MobileNetV3Large + elif backbone_options.name.lower() == 'mobilenet_v3_small': + backbone = mobilenet.MobileNetV3Small + else: + raise ValueError('The specified encoder %s is not a valid encoder.' 
% + backbone_options.name) + assert backbone_options.use_squeeze_and_excite + assert backbone_options.drop_path_keep_prob == 1 + assert backbone_options.use_sac_beyond_stride == -1 + assert backbone_options.backbone_layer_multiplier == 1 + return backbone( + output_stride=backbone_options.output_stride, + width_multiplier=backbone_options.backbone_width_multiplier, + bn_layer=bn_layer, + conv_kernel_weight_decay=conv_kernel_weight_decay) + + +def create_resnet_encoder( + backbone_options: config_pb2.ModelOptions.BackboneOptions, + bn_layer: tf.keras.layers.Layer, + conv_kernel_weight_decay: float = 0.0) -> tf.keras.Model: + """Creates a ResNet encoder specified by name. + + Args: + backbone_options: A proto config of type + config_pb2.ModelOptions.BackboneOptions. + bn_layer: A tf.keras.layers.Layer that computes the normalization. + conv_kernel_weight_decay: A float, the weight decay for convolution kernels. + + Returns: + An instance of tf.keras.Model containing the ResNet encoder. + """ + return axial_resnet_instances.get_model( + backbone_options.name, + output_stride=backbone_options.output_stride, + stem_width_multiplier=backbone_options.stem_width_multiplier, + width_multiplier=backbone_options.backbone_width_multiplier, + backbone_layer_multiplier=backbone_options.backbone_layer_multiplier, + block_group_config={ + 'use_squeeze_and_excite': backbone_options.use_squeeze_and_excite, + 'drop_path_keep_prob': backbone_options.drop_path_keep_prob, + 'drop_path_schedule': backbone_options.drop_path_schedule, + 'use_sac_beyond_stride': backbone_options.use_sac_beyond_stride}, + bn_layer=bn_layer, + conv_kernel_weight_decay=conv_kernel_weight_decay) + + +def create_decoder(model_options: config_pb2.ModelOptions, + bn_layer: tf.keras.layers.Layer, + ignore_label: int) -> tf.keras.Model: + """Creates a DeepLab decoder. + + Args: + model_options: A proto config of type config_pb2.ModelOptions. + bn_layer: A tf.keras.layers.Layer that computes the normalization. + ignore_label: An integer specifying the ignore label. + + Returns: + An instance of tf.keras.layers.Layer containing the decoder. + + Raises: + ValueError: An error occurs when the specified meta architecture is not + supported. + """ + meta_architecture = model_options.WhichOneof('meta_architecture') + if meta_architecture == 'deeplab_v3': + return deeplabv3.DeepLabV3( + model_options.decoder, model_options.deeplab_v3, bn_layer=bn_layer) + elif meta_architecture == 'deeplab_v3_plus': + return deeplabv3plus.DeepLabV3Plus( + model_options.decoder, model_options.deeplab_v3_plus, bn_layer=bn_layer) + elif meta_architecture == 'panoptic_deeplab': + return panoptic_deeplab.PanopticDeepLab( + model_options.decoder, + model_options.panoptic_deeplab, + bn_layer=bn_layer) + elif meta_architecture == 'motion_deeplab': + return motion_deeplab_decoder.MotionDeepLabDecoder( + model_options.decoder, + model_options.motion_deeplab, + bn_layer=bn_layer) + elif meta_architecture == 'vip_deeplab': + return vip_deeplab_decoder.ViPDeepLabDecoder( + model_options.decoder, + model_options.vip_deeplab, + bn_layer=bn_layer) + elif meta_architecture == 'max_deeplab': + return max_deeplab.MaXDeepLab( + model_options.decoder, + model_options.max_deeplab, + ignore_label=ignore_label, + bn_layer=bn_layer) + raise ValueError('The specified meta architecture %s is not implemented.' 
% + meta_architecture) diff --git a/model/builder_test.py b/model/builder_test.py new file mode 100644 index 0000000000000000000000000000000000000000..6fd603127caf05c0c72bc892c8bb93a7c81393be --- /dev/null +++ b/model/builder_test.py @@ -0,0 +1,80 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for model.builder.""" + +import os +from absl.testing import parameterized + +import tensorflow as tf + +from google.protobuf import text_format +from deeplab2 import config_pb2 +from deeplab2.model import builder +from deeplab2.model.decoder import motion_deeplab_decoder +from deeplab2.model.encoder import axial_resnet_instances +from deeplab2.model.encoder import mobilenet +# resources dependency + + +_CONFIG_PATH = 'deeplab2/configs/example' + + +def _read_proto_file(filename, proto): + filename = filename # OSS: removed internal filename loading. + with tf.io.gfile.GFile(filename, 'r') as proto_file: + return text_format.ParseLines(proto_file, proto) + + +class BuilderTest(tf.test.TestCase, parameterized.TestCase): + + def test_resnet50_encoder_creation(self): + backbone_options = config_pb2.ModelOptions.BackboneOptions( + name='resnet50', output_stride=32) + encoder = builder.create_encoder( + backbone_options, + tf.keras.layers.experimental.SyncBatchNormalization) + self.assertIsInstance(encoder, axial_resnet_instances.ResNet50) + + @parameterized.parameters('mobilenet_v3_large', 'mobilenet_v3_small') + def test_mobilenet_encoder_creation(self, model_name): + backbone_options = config_pb2.ModelOptions.BackboneOptions( + name=model_name, use_squeeze_and_excite=True, output_stride=32) + encoder = builder.create_encoder( + backbone_options, + tf.keras.layers.experimental.SyncBatchNormalization) + self.assertIsInstance(encoder, mobilenet.MobileNet) + + def test_resnet_encoder_creation(self): + backbone_options = config_pb2.ModelOptions.BackboneOptions( + name='max_deeplab_s', output_stride=32) + encoder = builder.create_resnet_encoder( + backbone_options, + bn_layer=tf.keras.layers.experimental.SyncBatchNormalization) + self.assertIsInstance(encoder, axial_resnet_instances.MaXDeepLabS) + + def test_decoder_creation(self): + proto_filename = os.path.join( + _CONFIG_PATH, 'example_kitti-step_motion_deeplab.textproto') + model_options = _read_proto_file(proto_filename, config_pb2.ModelOptions()) + motion_decoder = builder.create_decoder( + model_options, tf.keras.layers.experimental.SyncBatchNormalization, + ignore_label=255) + self.assertIsInstance(motion_decoder, + motion_deeplab_decoder.MotionDeepLabDecoder) + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/decoder/__init__.py b/model/decoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..35e4ce02ff422f3aa84ab644b88d65b13e0cbc03 --- /dev/null +++ b/model/decoder/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/model/decoder/aspp.py b/model/decoder/aspp.py
new file mode 100644
index 0000000000000000000000000000000000000000..32cc3e4f66c6ede6a6f32922933d32a0724c7f80
--- /dev/null
+++ b/model/decoder/aspp.py
@@ -0,0 +1,289 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This file contains code to build an ASPP layer.
+
+Reference:
+  - [Rethinking Atrous Convolution for Semantic Image Segmentation](
+        https://arxiv.org/pdf/1706.05587.pdf)
+  - [ParseNet: Looking Wider to See Better](
+        https://arxiv.org/pdf/1506.04579.pdf).
+"""
+from absl import logging
+import tensorflow as tf
+
+from deeplab2.model import utils
+from deeplab2.model.layers import convolutions
+
+
+layers = tf.keras.layers
+backend = tf.keras.backend
+
+
+class ASPPConv(tf.keras.layers.Layer):
+  """An atrous convolution for ASPP."""
+
+  def __init__(self,
+               output_channels,
+               atrous_rate,
+               name,
+               bn_layer=tf.keras.layers.BatchNormalization):
+    """Creates an atrous convolution layer for the ASPP.
+
+    This layer consists of an atrous convolution followed by a BatchNorm layer
+    and a ReLU activation.
+
+    Args:
+      output_channels: An integer specifying the number of output channels of
+        the convolution.
+      atrous_rate: An integer specifying the atrous/dilation rate of the
+        convolution.
+      name: A string specifying the name of this layer.
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+    """
+    super(ASPPConv, self).__init__(name=name)
+
+    self._conv_bn_act = convolutions.Conv2DSame(
+        output_channels,
+        kernel_size=3,
+        name='conv_bn_act',
+        atrous_rate=atrous_rate,
+        use_bias=False,
+        use_bn=True,
+        bn_layer=bn_layer,
+        activation='relu')
+
+  def call(self, input_tensor, training=False):
+    """Performs a forward pass.
+
+    Args:
+      input_tensor: An input tensor of type tf.Tensor with shape [batch,
+        height, width, channels].
+      training: A boolean flag indicating whether training behavior should be
+        used (default: False).
+
+    Returns:
+      The output tensor.
+    """
+    return self._conv_bn_act(input_tensor, training=training)
+
+
+class ASPPPool(tf.keras.layers.Layer):
+  """A pooling layer for ASPP."""
+
+  def __init__(self,
+               output_channels,
+               name,
+               bn_layer=tf.keras.layers.BatchNormalization):
+    """Creates a pooling layer for the ASPP.
+
+    This layer consists of global average pooling, followed by a convolution,
+    a BatchNorm layer, and a ReLU activation.
+
+    Args:
+      output_channels: An integer specifying the number of output channels of
+        the convolution.
+      name: A string specifying the name of this layer.
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+    """
+    super(ASPPPool, self).__init__(name=name)
+
+    self._pool_size = (None, None)
+    self._conv_bn_act = convolutions.Conv2DSame(
+        output_channels,
+        kernel_size=1,
+        name='conv_bn_act',
+        use_bias=False,
+        use_bn=True,
+        bn_layer=bn_layer,
+        activation='relu')
+
+  def set_pool_size(self, pool_size):
+    """Sets the pooling size of the pooling layer.
+
+    The default behavior of the pooling layer is global average pooling. A
+    custom pooling size can be set here.
+
+    Args:
+      pool_size: A tuple specifying the pooling size of the pooling layer.
+
+    Raises:
+      ValueError: If exactly one pooling dimension is set to None.
+    """
+    # If exactly one pooling dimension is None, raise an error.
+    if None in pool_size and pool_size != (None, None):
+      raise ValueError('The ASPP pooling layer requires that the pooling size '
+                       'is set explicitly for both dimensions. If global '
+                       'average pooling is intended, call '
+                       'reset_pooling_layer() or set both dimensions to None.')
+
+    self._pool_size = pool_size
+    logging.info('Global average pooling in the ASPP pooling layer was '
+                 'replaced with tiled average pooling using the provided '
+                 'pool_size. Please make sure this behavior is intended.')
+
+  def get_pool_size(self):
+    return self._pool_size
+
+  def reset_pooling_layer(self):
+    """Resets the pooling layer to global average pooling."""
+    self._pool_size = (None, None)
+
+  def call(self, input_tensor, training=False):
+    """Performs a forward pass.
+
+    Args:
+      input_tensor: An input tensor of type tf.Tensor with shape [batch,
+        height, width, channels].
+      training: A boolean flag indicating whether training behavior should be
+        used (default: False).
+
+    Returns:
+      The output tensor.
+    """
+    if tuple(self._pool_size) == (None, None):
+      # Global image pooling.
+      pool_size = input_tensor.shape[1:3]
+    else:
+      # Tiled image pooling.
+      pool_size = self._pool_size
+
+    x = backend.pool2d(input_tensor, pool_size, padding='valid',
+                       pool_mode='avg')
+    x = self._conv_bn_act(x, training=training)
+
+    target_h = tf.shape(input_tensor)[1]
+    target_w = tf.shape(input_tensor)[2]
+
+    x = utils.resize_align_corners(x, [target_h, target_w])
+    return x
+
+
+class ASPP(tf.keras.layers.Layer):
+  """An atrous spatial pyramid pooling layer."""
+
+  def __init__(self,
+               output_channels,
+               atrous_rates,
+               aspp_use_only_1x1_proj_conv=False,
+               name='ASPP',
+               bn_layer=tf.keras.layers.BatchNormalization):
+    """Creates an ASPP layer.
+
+    Args:
+      output_channels: An integer specifying the number of output channels of
+        each ASPP convolution layer.
+      atrous_rates: A list of three integers specifying the atrous/dilation
+        rate of each ASPP convolution layer.
+      aspp_use_only_1x1_proj_conv: Boolean, specifying whether the five ASPP
+        branches are turned off. If True, the ASPP module degenerates to a
+        single 1x1 convolution, projecting the input channels to
+        `output_channels`.
+      name: A string specifying the name of this layer (default: 'ASPP').
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+
+    Raises:
+      ValueError: If `aspp_use_only_1x1_proj_conv` is False and atrous_rates
+        does not contain exactly 3 elements.
+    """
+    super(ASPP, self).__init__(name=name)
+
+    if not aspp_use_only_1x1_proj_conv and len(atrous_rates) != 3:
+      raise ValueError(
+          'The ASPP layers need exactly 3 atrous rates, but %d were given' %
+          len(atrous_rates))
+    self._aspp_use_only_1x1_proj_conv = aspp_use_only_1x1_proj_conv
+
+    # The projection convolution is always used.
+    self._proj_conv_bn_act = convolutions.Conv2DSame(
+        output_channels,
+        kernel_size=1,
+        name='proj_conv_bn_act',
+        use_bias=False,
+        use_bn=True,
+        bn_layer=bn_layer,
+        activation='relu')
+
+    if not aspp_use_only_1x1_proj_conv:
+      self._conv_bn_act = convolutions.Conv2DSame(
+          output_channels,
+          kernel_size=1,
+          name='conv_bn_act',
+          use_bias=False,
+          use_bn=True,
+          bn_layer=bn_layer,
+          activation='relu')
+      rate1, rate2, rate3 = atrous_rates
+      self._aspp_conv1 = ASPPConv(output_channels, rate1, name='aspp_conv1',
+                                  bn_layer=bn_layer)
+      self._aspp_conv2 = ASPPConv(output_channels, rate2, name='aspp_conv2',
+                                  bn_layer=bn_layer)
+      self._aspp_conv3 = ASPPConv(output_channels, rate3, name='aspp_conv3',
+                                  bn_layer=bn_layer)
+      self._aspp_pool = ASPPPool(output_channels, name='aspp_pool',
+                                 bn_layer=bn_layer)
+      # Dropout is needed only when the five ASPP branches are used.
+      self._proj_drop = layers.Dropout(rate=0.1)
+
+  def set_pool_size(self, pool_size):
+    """Sets the pooling size of the ASPP pooling layer.
+
+    The default behavior of the pooling layer is global average pooling. A
+    custom pooling size can be set here.
+
+    Args:
+      pool_size: A tuple specifying the pooling size of the ASPP pooling
+        layer.
+    """
+    if not self._aspp_use_only_1x1_proj_conv:
+      self._aspp_pool.set_pool_size(pool_size)
+
+  def get_pool_size(self):
+    if not self._aspp_use_only_1x1_proj_conv:
+      return self._aspp_pool.get_pool_size()
+    else:
+      return (None, None)
+
+  def reset_pooling_layer(self):
+    """Resets the pooling layer to global average pooling."""
+    # The pooling layer only exists when the five ASPP branches are used.
+    if not self._aspp_use_only_1x1_proj_conv:
+      self._aspp_pool.reset_pooling_layer()
+
+  def call(self, input_tensor, training=False):
+    """Performs a forward pass.
+
+    Args:
+      input_tensor: An input tensor of type tf.Tensor with shape [batch,
+        height, width, channels].
+      training: A boolean flag indicating whether training behavior should be
+        used (default: False).
+
+    Returns:
+      The output tensor.
+    """
+    if self._aspp_use_only_1x1_proj_conv:
+      x = self._proj_conv_bn_act(input_tensor, training=training)
+    else:
+      # Apply the ASPP module.
+      results = []
+      results.append(self._conv_bn_act(input_tensor, training=training))
+      results.append(self._aspp_conv1(input_tensor, training=training))
+      results.append(self._aspp_conv2(input_tensor, training=training))
+      results.append(self._aspp_conv3(input_tensor, training=training))
+      results.append(self._aspp_pool(input_tensor, training=training))
+      x = tf.concat(results, 3)
+      x = self._proj_conv_bn_act(x, training=training)
+      x = self._proj_drop(x, training=training)
+    return x
diff --git a/model/decoder/aspp_test.py b/model/decoder/aspp_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..75f952b3daece31efec51c05ce4387837002b216
--- /dev/null
+++ b/model/decoder/aspp_test.py
@@ -0,0 +1,91 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for aspp.""" +import tensorflow as tf + +from deeplab2.model.decoder import aspp +from deeplab2.utils import test_utils + + +class AsppTest(tf.test.TestCase): + + def test_aspp_pool_error(self): + pool = aspp.ASPPPool(output_channels=64, name='') + + # Should pass without an error. + pool.set_pool_size((None, None)) + + with self.assertRaises(ValueError): + # Should raise an error. + pool.set_pool_size((2, None)) + + def test_aspp_conv_atrous_rate_shape(self): + atrous_rates = [2, 6, 12, 18] + for rate in atrous_rates: + conv = aspp.ASPPConv(output_channels=64, atrous_rate=rate, name='') + input_tensor = tf.random.uniform(shape=(2, 12, 12, 3)) + + output = conv(input_tensor) + expected_shape = [2, 12, 12, 64] + self.assertListEqual(output.shape.as_list(), expected_shape) + + def test_aspp_conv_non_negative(self): + conv = aspp.ASPPConv(output_channels=12, atrous_rate=2, name='') + input_tensor = tf.random.uniform(shape=(2, 17, 17, 3)) + + output = conv(input_tensor) + self.assertTrue((output.numpy() >= 0.0).all()) + + def test_aspp_pool_shape(self): + pool = aspp.ASPPPool(output_channels=64, name='') + input_tensor = tf.random.uniform(shape=(2, 12, 12, 3)) + + output = pool(input_tensor) + expected_shape = [2, 12, 12, 64] + self.assertListEqual(output.shape.as_list(), expected_shape) + + def test_aspp_pool_non_negative(self): + pool = aspp.ASPPPool(output_channels=12, name='') + input_tensor = tf.random.uniform(shape=(2, 17, 17, 3)) + + output = pool(input_tensor) + self.assertTrue((output.numpy() >= 0.0).all()) + + def test_aspp_wrong_atrous_rate(self): + with self.assertRaises(ValueError): + _ = aspp.ASPP(output_channels=64, atrous_rates=[1, 2, 3, 4]) + + @test_utils.test_all_strategies + def test_aspp_shape(self, strategy): + with strategy.scope(): + for bn_layer in test_utils.NORMALIZATION_LAYERS: + aspp_layer = aspp.ASPP( + output_channels=64, atrous_rates=[6, 12, 18], bn_layer=bn_layer) + input_tensor = tf.random.uniform(shape=(2, 32, 32, 3)) + + output = aspp_layer(input_tensor) + expected_shape = [2, 32, 32, 64] + self.assertListEqual(output.shape.as_list(), expected_shape) + + def test_aspp_non_negative(self): + aspp_layer = aspp.ASPP(output_channels=32, atrous_rates=[4, 8, 16]) + input_tensor = tf.random.uniform(shape=(2, 32, 32, 3)) + + output = aspp_layer(input_tensor) + self.assertTrue((output.numpy() >= 0.0).all()) + +if __name__ == '__main__': + tf.test.main() diff --git a/model/decoder/deeplabv3.py b/model/decoder/deeplabv3.py new file mode 100644 index 0000000000000000000000000000000000000000..f3217543510dcae2f89f396534ab4a0c15ccff0a --- /dev/null +++ b/model/decoder/deeplabv3.py @@ -0,0 +1,121 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This file contains code to build a DeepLabV3. + +Reference: + - [Rethinking Atrous Convolution for Semantic Image Segmentation]( + https://arxiv.org/pdf/1706.05587.pdf) +""" +import tensorflow as tf + +from deeplab2 import common +from deeplab2.model.decoder import aspp +from deeplab2.model.layers import convolutions + + +layers = tf.keras.layers + + +class DeepLabV3(layers.Layer): + """A DeepLabV3 model. + + This model takes in features from an encoder and performs multi-scale context + aggregation with the help of an ASPP layer. Finally, a classification head is + used to predict a semantic segmentation. + """ + + def __init__(self, + decoder_options, + deeplabv3_options, + bn_layer=tf.keras.layers.BatchNormalization): + """Creates a DeepLabV3 decoder of type layers.Layer. + + Args: + decoder_options: Decoder options as defined in config_pb2.DecoderOptions. + deeplabv3_options: Model options as defined in + config_pb2.ModelOptions.DeeplabV3Options. + bn_layer: An optional tf.keras.layers.Layer that computes the + normalization (default: tf.keras.layers.BatchNormalization). + """ + super(DeepLabV3, self).__init__(name='DeepLabV3') + + self._feature_name = decoder_options.feature_key + self._aspp = aspp.ASPP(decoder_options.aspp_channels, + decoder_options.atrous_rates, + bn_layer=bn_layer) + + self._classifier_conv_bn_act = convolutions.Conv2DSame( + decoder_options.decoder_channels, + kernel_size=3, + name='classifier_conv_bn_act', + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation='relu') + + self._final_conv = convolutions.Conv2DSame( + deeplabv3_options.num_classes, kernel_size=1, name='final_conv') + + def set_pool_size(self, pool_size): + """Sets the pooling size of the ASPP pooling layer. + + Args: + pool_size: A tuple specifying the pooling size of the ASPP pooling layer. + """ + self._aspp.set_pool_size(pool_size) + + def get_pool_size(self): + return self._aspp.get_pool_size() + + def reset_pooling_layer(self): + """Resets the ASPP pooling layer to global average pooling.""" + self._aspp.reset_pooling_layer() + + def call(self, features, training=False): + """Performs a forward pass. + + Args: + features: A single input tf.Tensor or an input dict of tf.Tensor with + shape [batch, height, width, channels]. If passed a dict, different keys + should point to different features extracted by the encoder, e.g. + low-level or high-level features. + training: A boolean flag indicating whether training behavior should be + used (default: False). + + Returns: + A dictionary containing the semantic prediction under key + common.PRED_SEMANTIC_LOGITS_KEY. 
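+      The logits keep the spatial size of the input feature, i.e., they have
+      shape [batch, height, width, num_classes].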
+ """ + if isinstance(features, tf.Tensor): + feature = features + else: + feature = features[self._feature_name] + + x = self._aspp(feature, training=training) + + x = self._classifier_conv_bn_act(x, training=training) + + return {common.PRED_SEMANTIC_LOGITS_KEY: self._final_conv(x)} + + @property + def checkpoint_items(self): + items = { + common.CKPT_DEEPLABV3_ASPP: self._aspp, + common.CKPT_DEEPLABV3_CLASSIFIER_CONV_BN_ACT: + self._classifier_conv_bn_act, + common.CKPT_SEMANTIC_LAST_LAYER: self._final_conv, + } + return items diff --git a/model/decoder/deeplabv3_test.py b/model/decoder/deeplabv3_test.py new file mode 100644 index 0000000000000000000000000000000000000000..9cf6698585cb0ce5d14b53021cbe631ad26a1848 --- /dev/null +++ b/model/decoder/deeplabv3_test.py @@ -0,0 +1,143 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for deeplabv3.""" + +import numpy as np +import tensorflow as tf + +from deeplab2 import common +from deeplab2 import config_pb2 +from deeplab2.model.decoder import deeplabv3 +from deeplab2.utils import test_utils + + +def _create_deeplabv3_model(feature_key, decoder_channels, aspp_channels, + atrous_rates, num_classes, **kwargs): + decoder_options = config_pb2.DecoderOptions( + feature_key=feature_key, + decoder_channels=decoder_channels, + aspp_channels=aspp_channels, + atrous_rates=atrous_rates) + deeplabv3_options = config_pb2.ModelOptions.DeeplabV3Options( + num_classes=num_classes) + return deeplabv3.DeepLabV3(decoder_options, deeplabv3_options, **kwargs) + + +class Deeplabv3Test(tf.test.TestCase): + + def test_deeplabv3_feature_key_not_present(self): + deeplabv3_decoder = _create_deeplabv3_model( + feature_key='not_in_features_dict', + aspp_channels=64, + decoder_channels=48, + atrous_rates=[6, 12, 18], + num_classes=80) + input_dict = dict() + input_dict['not_the_same_key'] = tf.random.uniform(shape=(2, 65, 65, 32)) + + with self.assertRaises(KeyError): + _ = deeplabv3_decoder(input_dict) + + def test_deeplabv3_output_shape(self): + list_of_num_classes = [2, 19, 133] + for num_classes in list_of_num_classes: + deeplabv3_decoder = _create_deeplabv3_model( + feature_key='not_used', + aspp_channels=64, + decoder_channels=48, + atrous_rates=[6, 12, 18], + num_classes=num_classes) + input_tensor = tf.random.uniform(shape=(2, 65, 65, 32)) + expected_shape = [2, 65, 65, num_classes] + + logit_tensor = deeplabv3_decoder(input_tensor) + self.assertListEqual( + logit_tensor[common.PRED_SEMANTIC_LOGITS_KEY].shape.as_list(), + expected_shape) + + @test_utils.test_all_strategies + def test_sync_bn(self, strategy): + input_tensor = tf.random.uniform(shape=(2, 65, 65, 32)) + with strategy.scope(): + for bn_layer in test_utils.NORMALIZATION_LAYERS: + deeplabv3_decoder = _create_deeplabv3_model( + feature_key='not_used', + aspp_channels=64, + decoder_channels=48, + atrous_rates=[6, 12, 18], + num_classes=19, + bn_layer=bn_layer) + _ = deeplabv3_decoder(input_tensor) + + def 
test_deeplabv3_feature_extraction_consistency(self): + deeplabv3_decoder = _create_deeplabv3_model( + aspp_channels=64, + decoder_channels=48, + atrous_rates=[6, 12, 18], + num_classes=80, + feature_key='feature_key') + input_tensor = tf.random.uniform(shape=(2, 65, 65, 32)) + input_dict = dict() + input_dict['feature_key'] = input_tensor + + reference_logits_tensor = deeplabv3_decoder(input_tensor, training=False) + logits_tensor_to_compare = deeplabv3_decoder(input_dict, training=False) + + np.testing.assert_equal( + reference_logits_tensor[common.PRED_SEMANTIC_LOGITS_KEY].numpy(), + logits_tensor_to_compare[common.PRED_SEMANTIC_LOGITS_KEY].numpy()) + + def test_deeplabv3_pool_size_setter(self): + deeplabv3_decoder = _create_deeplabv3_model( + feature_key='not_used', + aspp_channels=64, + decoder_channels=48, + atrous_rates=[6, 12, 18], + num_classes=80) + pool_size = (10, 10) + deeplabv3_decoder.set_pool_size(pool_size) + + self.assertTupleEqual(deeplabv3_decoder._aspp._aspp_pool._pool_size, + pool_size) + + def test_deeplabv3_pool_size_resetter(self): + deeplabv3_decoder = _create_deeplabv3_model( + feature_key='not_used', + aspp_channels=64, + decoder_channels=48, + atrous_rates=[6, 12, 18], + num_classes=80) + pool_size = (None, None) + deeplabv3_decoder.reset_pooling_layer() + + self.assertTupleEqual(deeplabv3_decoder._aspp._aspp_pool._pool_size, + pool_size) + + def test_deeplabv3_ckpt_items(self): + deeplabv3_decoder = _create_deeplabv3_model( + feature_key='not_used', + aspp_channels=64, + decoder_channels=48, + atrous_rates=[6, 12, 18], + num_classes=80) + ckpt_dict = deeplabv3_decoder.checkpoint_items + self.assertIn(common.CKPT_DEEPLABV3_ASPP, ckpt_dict) + self.assertIn(common.CKPT_DEEPLABV3_CLASSIFIER_CONV_BN_ACT, ckpt_dict) + self.assertIn(common.CKPT_SEMANTIC_LAST_LAYER, ckpt_dict) + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/decoder/deeplabv3plus.py b/model/decoder/deeplabv3plus.py new file mode 100644 index 0000000000000000000000000000000000000000..35d66f8ad95fc7ab6e3bbf54774a0c50cf105bbb --- /dev/null +++ b/model/decoder/deeplabv3plus.py @@ -0,0 +1,145 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This file contains code to build a DeepLabV3Plus. + +Reference: + - [Encoder-Decoder with Atrous Separable Convolution for Semantic Image + Segmentation](https://arxiv.org/pdf/1802.02611.pdf) +""" + +import tensorflow as tf + +from deeplab2 import common +from deeplab2.model import utils +from deeplab2.model.decoder import aspp +from deeplab2.model.layers import convolutions + + +layers = tf.keras.layers + + +class DeepLabV3Plus(tf.keras.layers.Layer): + """A DeepLabV3+ decoder model. + + This model takes in low- and high-level features from an encoder and performs + multi-scale context aggregation with the help of an ASPP layer on high-level + features. 
These are concatenated with the low-level features and used as input + to the classification head that is used to predict a semantic segmentation. + """ + + def __init__(self, + decoder_options, + deeplabv3plus_options, + bn_layer=tf.keras.layers.BatchNormalization): + """Creates a DeepLabV3+ decoder of type tf.keras.layers.Layer. + + Args: + decoder_options: Decoder options as defined in config_pb2.DecoderOptions. + deeplabv3plus_options: Model options as defined in + config_pb2.ModelOptions.DeeplabV3PlusOptions. + bn_layer: An optional tf.keras.layers.Layer that computes the + normalization (default: tf.keras.layers.BatchNormalization). + """ + super(DeepLabV3Plus, self).__init__(name='DeepLabv3Plus') + + self._high_level_feature_name = decoder_options.feature_key + self._low_level_feature_name = deeplabv3plus_options.low_level.feature_key + self._aspp = aspp.ASPP(decoder_options.aspp_channels, + decoder_options.atrous_rates, + bn_layer=bn_layer) + + # Layers for low-level feature transformation. + self._project_conv_bn_act = convolutions.Conv2DSame( + deeplabv3plus_options.low_level.channels_project, + kernel_size=1, + name='project_conv_bn_act', + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation='relu') + + # Layers for fusing low- and high-level features. + self._fuse = convolutions.StackedConv2DSame( + conv_type='depthwise_separable_conv', + num_layers=2, + output_channels=decoder_options.decoder_channels, + kernel_size=3, + name='fuse', + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation='relu') + + self._final_conv = convolutions.Conv2DSame( + deeplabv3plus_options.num_classes, kernel_size=1, name='final_conv') + + def reset_pooling_layer(self): + """Resets the ASPP pooling layer to global average pooling.""" + self._aspp.reset_pooling_layer() + + def set_pool_size(self, pool_size): + """Sets the pooling size of the ASPP pooling layer. + + Args: + pool_size: A tuple specifying the pooling size of the ASPP pooling layer. + """ + self._aspp.set_pool_size(pool_size) + + def get_pool_size(self): + return self._aspp.get_pool_size() + + @property + def checkpoint_items(self): + items = { + common.CKPT_DEEPLABV3PLUS_ASPP: self._aspp, + common.CKPT_DEEPLABV3PLUS_PROJECT_CONV_BN_ACT: + self._project_conv_bn_act, + common.CKPT_DEEPLABV3PLUS_FUSE: self._fuse, + common.CKPT_SEMANTIC_LAST_LAYER: self._final_conv, + } + return items + + def call(self, features, training=False): + """Performs a forward pass. + + Args: + features: An input dict of tf.Tensor with shape [batch, height, width, + channels]. Different keys should point to different features extracted + by the encoder, e.g. low-level or high-level features. + training: A boolean flag indicating whether training behavior should be + used (default: False). + + Returns: + A dictionary containing the semantic prediction under key + common.PRED_SEMANTIC_LOGITS_KEY. 
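+      The logits are produced at the spatial resolution of the low-level
+      features, to which the ASPP output is upsampled before fusion.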
+    """
+    low_level_features = features[self._low_level_feature_name]
+    high_level_features = features[self._high_level_feature_name]
+
+    high_level_features = self._aspp(high_level_features, training=training)
+
+    low_level_features = self._project_conv_bn_act(low_level_features,
+                                                   training=training)
+
+    target_h = tf.shape(low_level_features)[1]
+    target_w = tf.shape(low_level_features)[2]
+
+    high_level_features = utils.resize_align_corners(
+        high_level_features, [target_h, target_w])
+    x = tf.concat([high_level_features, low_level_features], 3)
+    # Pass the training flag so the BatchNorm layers in the fusion block
+    # behave correctly during training.
+    x = self._fuse(x, training=training)
+
+    return {common.PRED_SEMANTIC_LOGITS_KEY: self._final_conv(x)}
diff --git a/model/decoder/deeplabv3plus_test.py b/model/decoder/deeplabv3plus_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1419b55acc0a5973e414ca7a12d2716d0f838b57
--- /dev/null
+++ b/model/decoder/deeplabv3plus_test.py
@@ -0,0 +1,169 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for deeplabv3plus."""
+
+import numpy as np
+import tensorflow as tf
+
+from deeplab2 import common
+from deeplab2 import config_pb2
+from deeplab2.model.decoder import deeplabv3plus
+from deeplab2.utils import test_utils
+
+
+def _create_deeplabv3plus_model(high_level_feature_name,
+                                low_level_feature_name,
+                                low_level_channels_project,
+                                aspp_output_channels, decoder_output_channels,
+                                atrous_rates, num_classes, **kwargs):
+  decoder_options = config_pb2.DecoderOptions(
+      feature_key=high_level_feature_name,
+      decoder_channels=decoder_output_channels,
+      aspp_channels=aspp_output_channels,
+      atrous_rates=atrous_rates)
+  deeplabv3plus_options = config_pb2.ModelOptions.DeeplabV3PlusOptions(
+      low_level=config_pb2.LowLevelOptions(
+          feature_key=low_level_feature_name,
+          channels_project=low_level_channels_project),
+      num_classes=num_classes)
+  return deeplabv3plus.DeepLabV3Plus(decoder_options, deeplabv3plus_options,
+                                     **kwargs)
+
+
+class Deeplabv3PlusTest(tf.test.TestCase):
+
+  def test_deeplabv3plus_feature_key_not_present(self):
+    deeplabv3plus_decoder = _create_deeplabv3plus_model(
+        high_level_feature_name='not_in_features_dict',
+        low_level_feature_name='in_feature_dict',
+        low_level_channels_project=128,
+        aspp_output_channels=64,
+        decoder_output_channels=64,
+        atrous_rates=[6, 12, 18],
+        num_classes=80)
+    input_dict = dict()
+    input_dict['in_feature_dict'] = tf.random.uniform(shape=(2, 65, 65, 32))
+
+    with self.assertRaises(KeyError):
+      _ = deeplabv3plus_decoder(input_dict)
+
+  def test_deeplabv3plus_output_shape(self):
+    list_of_num_classes = [2, 19, 133]
+    for num_classes in list_of_num_classes:
+      deeplabv3plus_decoder = _create_deeplabv3plus_model(
+          high_level_feature_name='high',
+          low_level_feature_name='low',
+          low_level_channels_project=128,
+          aspp_output_channels=64,
+          decoder_output_channels=128,
+          atrous_rates=[6, 12, 18],
+          num_classes=num_classes)
+      input_dict = dict()
+      input_dict['high'] = tf.random.uniform(shape=(2, 65, 65, 32))
+      input_dict['low'] =
tf.random.uniform(shape=(2, 129, 129, 16)) + expected_shape = [2, 129, 129, num_classes] + + logit_tensor = deeplabv3plus_decoder(input_dict) + self.assertListEqual( + logit_tensor[common.PRED_SEMANTIC_LOGITS_KEY].shape.as_list(), + expected_shape) + + def test_deeplabv3plus_feature_extraction_consistency(self): + deeplabv3plus_decoder = _create_deeplabv3plus_model( + high_level_feature_name='high', + low_level_feature_name='low', + low_level_channels_project=128, + aspp_output_channels=96, + decoder_output_channels=64, + atrous_rates=[6, 12, 18], + num_classes=80) + input_dict = dict() + input_dict['high'] = tf.random.uniform(shape=(2, 65, 65, 32)) + input_dict['low'] = tf.random.uniform(shape=(2, 129, 129, 16)) + + reference_logits_tensor = deeplabv3plus_decoder( + input_dict, training=False) + logits_tensor_to_compare = deeplabv3plus_decoder(input_dict, training=False) + + np.testing.assert_equal( + reference_logits_tensor[common.PRED_SEMANTIC_LOGITS_KEY].numpy(), + logits_tensor_to_compare[common.PRED_SEMANTIC_LOGITS_KEY].numpy()) + + def test_deeplabv3plus_pool_size_setter(self): + deeplabv3plus_decoder = _create_deeplabv3plus_model( + high_level_feature_name='high', + low_level_feature_name='low', + low_level_channels_project=128, + aspp_output_channels=96, + decoder_output_channels=64, + atrous_rates=[6, 12, 18], + num_classes=80) + pool_size = (10, 10) + deeplabv3plus_decoder.set_pool_size(pool_size) + + self.assertTupleEqual(deeplabv3plus_decoder._aspp._aspp_pool._pool_size, + pool_size) + + @test_utils.test_all_strategies + def test_deeplabv3plus_sync_bn(self, strategy): + input_dict = dict() + input_dict['high'] = tf.random.uniform(shape=(2, 65, 65, 32)) + input_dict['low'] = tf.random.uniform(shape=(2, 129, 129, 16)) + with strategy.scope(): + for bn_layer in test_utils.NORMALIZATION_LAYERS: + deeplabv3plus_decoder = _create_deeplabv3plus_model( + high_level_feature_name='high', + low_level_feature_name='low', + low_level_channels_project=128, + aspp_output_channels=96, + decoder_output_channels=64, + atrous_rates=[6, 12, 18], + num_classes=80, + bn_layer=bn_layer) + _ = deeplabv3plus_decoder(input_dict) + + def test_deeplabv3plus_pool_size_resetter(self): + deeplabv3plus_decoder = _create_deeplabv3plus_model( + high_level_feature_name='high', + low_level_feature_name='low', + low_level_channels_project=128, + aspp_output_channels=96, + decoder_output_channels=64, + atrous_rates=[6, 12, 18], + num_classes=80) + pool_size = (None, None) + deeplabv3plus_decoder.reset_pooling_layer() + + self.assertTupleEqual(deeplabv3plus_decoder._aspp._aspp_pool._pool_size, + pool_size) + + def test_deeplabv3plus_ckpt_items(self): + deeplabv3plus_decoder = _create_deeplabv3plus_model( + high_level_feature_name='high', + low_level_feature_name='low', + low_level_channels_project=128, + aspp_output_channels=96, + decoder_output_channels=64, + atrous_rates=[6, 12, 18], + num_classes=80) + ckpt_dict = deeplabv3plus_decoder.checkpoint_items + self.assertIn(common.CKPT_DEEPLABV3PLUS_ASPP, ckpt_dict) + self.assertIn(common.CKPT_DEEPLABV3PLUS_PROJECT_CONV_BN_ACT, ckpt_dict) + self.assertIn(common.CKPT_DEEPLABV3PLUS_FUSE, ckpt_dict) + self.assertIn(common.CKPT_SEMANTIC_LAST_LAYER, ckpt_dict) + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/decoder/max_deeplab.py b/model/decoder/max_deeplab.py new file mode 100644 index 0000000000000000000000000000000000000000..b8c61a09a8445fe6406806bdabb4b0b932dd6f23 --- /dev/null +++ b/model/decoder/max_deeplab.py @@ -0,0 +1,328 @@ +# coding=utf-8 +# 
Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This file contains code to build MaX-DeepLab output heads.
+
+Reference:
+  MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers",
+  CVPR 2021. https://arxiv.org/abs/2012.00759
+    Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen.
+"""
+import math
+
+import tensorflow as tf
+
+from deeplab2 import common
+from deeplab2.model.decoder import panoptic_deeplab
+from deeplab2.model.layers import convolutions
+
+_PIXEL_SPACE_FEATURE_KEY = 'pixel_space_feature'
+
+
+def _get_transformer_class_head_num_classes(
+    auxiliary_semantic_head_output_channels,
+    ignore_label):
+  """Computes the number of classes for the transformer class head.
+
+  The transformer class head predicts non-void classes (i.e., thing classes
+  and stuff classes) and a void (i.e., ∅, no object) class. If the auxiliary
+  semantic head output channels already include the void class, e.g., on
+  COCO, we directly use the semantic output channels. Otherwise, e.g., on
+  Cityscapes, we add 1 (the void class) to the transformer class head.
+
+  Args:
+    auxiliary_semantic_head_output_channels: An integer, the number of output
+      channels of the auxiliary semantic head (it should be the same as the
+      num_classes field of the dataset information).
+    ignore_label: An integer specifying the ignore label, e.g., 255.
+
+  Returns:
+    num_classes: An integer, the number of classes for the transformer class
+      head.
+  """
+  if ignore_label >= auxiliary_semantic_head_output_channels:
+    return auxiliary_semantic_head_output_channels + 1
+  else:
+    return auxiliary_semantic_head_output_channels
+
+
+def add_bias_towards_void(transformer_class_logits, void_prior_prob=0.9):
+  """Adds an initial bias towards the void (no object) class to class logits.
+
+  We initialize the void class with a large probability, similar to Section
+  3.3 of the Focal Loss paper.
+
+  Reference:
+    Focal Loss for Dense Object Detection, ICCV 2017.
+      https://arxiv.org/abs/1708.02002
+        Tsung-Yi Lin, Priya Goyal, Ross Girshick, Kaiming He, Piotr Dollár.
+
+  Args:
+    transformer_class_logits: A [batch, num_mask_slots, num_classes] tensor,
+      the class logits predicted by the transformer. It concatenates
+      (num_classes - 1) non-void classes, including both thing classes and
+      stuff classes, and the void class (the last channel). If the dataset
+      class IDs do not follow this order, MaX-DeepLab loss functions will
+      handle the mapping and thus the architecture still supports any dataset.
+    void_prior_prob: A float, the desired probability (after softmax) of the
+      void class at initialization. Defaults to 0.9 as in MaX-DeepLab.
+
+  Returns:
+    updated_transformer_class_logits: A [batch, num_mask_slots, num_classes]
+      tensor with the updated logits.
+
+  Raises:
+    ValueError: If the rank of transformer_class_logits is not 3.
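+
+  For example, with num_classes = 3 and void_prior_prob = 0.9, the void bias
+  is initialized to log(2 * 0.9 / 0.1) ≈ 2.89, so that a zero-logit
+  prediction assigns a softmax probability of roughly 0.9 to the void class.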
+ """ + class_logits_shape = transformer_class_logits.get_shape().as_list() + if len(class_logits_shape) != 3: + raise ValueError('Input transformer_class_logits should have rank 3.') + + init_bias = [0.0] * class_logits_shape[-1] + init_bias[-1] = math.log( + (class_logits_shape[-1] - 1) * void_prior_prob / (1 - void_prior_prob)) + + # Broadcasting the 1D init_bias to the 3D transformer_class_logits. + return transformer_class_logits + tf.constant(init_bias, dtype=tf.float32) + + +def batch_norm_on_an_extra_axis(inputs, bn_layer): + """Applies a batch norm layer on an extra axis. + + This batch norm will be used on the pixel space mask logits in MaX-DeepLab to + avoid careful initialization of previous layers and careful scaling of the + resulting outputs. In addition, applying batch norm on an extra axis does not + introduce an extra gamma and beta for each mask slot. Instead, the current + gamma and beta are shared for all mask slots and do not introduce biases on + mask slots. + + Args: + inputs: A [batch, height, width, num_mask_slots] tensor. + bn_layer: A batch norm tf.keras.layers.Layer on the last axis. + + Returns: + outputs: A [batch, height, width, num_mask_slots] tensor. + """ + expanded_inputs = tf.expand_dims(inputs, axis=-1) + outputs = bn_layer(expanded_inputs) + return tf.squeeze(outputs, axis=-1) + + +class MaXDeepLab(tf.keras.layers.Layer): + """A MaX-DeepLab head layer.""" + + def __init__(self, + decoder_options, + max_deeplab_options, + ignore_label, + bn_layer=tf.keras.layers.BatchNormalization): + """Initializes a MaX-DeepLab head. + + Args: + decoder_options: Decoder options as defined in config_pb2.DecoderOptions. + max_deeplab_options: Model options as defined in + config_pb2.ModelOptions.MaXDeepLabOptions. + ignore_label: An integer specifying the ignore label. + bn_layer: An optional tf.keras.layers.Layer that computes the + normalization (default: tf.keras.layers.BatchNormalization). 
+
+    """
+    super(MaXDeepLab, self).__init__(name='MaXDeepLab')
+
+    low_level_feature_keys = [
+        item.feature_key for item in max_deeplab_options.auxiliary_low_level
+    ]
+    low_level_channels_project = [
+        item.channels_project
+        for item in max_deeplab_options.auxiliary_low_level
+    ]
+
+    self._auxiliary_semantic_decoder = (
+        panoptic_deeplab.PanopticDeepLabSingleDecoder(
+            high_level_feature_name=decoder_options.feature_key,
+            low_level_feature_names=low_level_feature_keys,
+            low_level_channels_project=low_level_channels_project,
+            aspp_output_channels=decoder_options.aspp_channels,
+            decoder_output_channels=decoder_options.decoder_channels,
+            atrous_rates=decoder_options.atrous_rates,
+            name='auxiliary_semantic_decoder',
+            aspp_use_only_1x1_proj_conv=decoder_options
+            .aspp_use_only_1x1_proj_conv,
+            decoder_conv_type=decoder_options.decoder_conv_type,
+            bn_layer=bn_layer))
+    self._auxiliary_semantic_head = panoptic_deeplab.PanopticDeepLabSingleHead(
+        max_deeplab_options.auxiliary_semantic_head.head_channels,
+        max_deeplab_options.auxiliary_semantic_head.output_channels,
+        common.PRED_SEMANTIC_LOGITS_KEY,
+        name='auxiliary_semantic_head',
+        conv_type=max_deeplab_options.auxiliary_semantic_head.head_conv_type,
+        bn_layer=bn_layer)
+    self._pixel_space_head = panoptic_deeplab.PanopticDeepLabSingleHead(
+        max_deeplab_options.pixel_space_head.head_channels,
+        max_deeplab_options.pixel_space_head.output_channels,
+        _PIXEL_SPACE_FEATURE_KEY,
+        name='pixel_space_head',
+        conv_type=max_deeplab_options.pixel_space_head.head_conv_type,
+        bn_layer=bn_layer)
+
+    self._transformer_mask_head = convolutions.Conv1D(
+        output_channels=max_deeplab_options.pixel_space_head.output_channels,
+        name='transformer_mask_head',
+        use_bias=False,
+        # Use bn to avoid careful initialization.
+        use_bn=True,
+        bn_layer=bn_layer,
+        bn_gamma_initializer='ones',
+        activation=None,
+        kernel_initializer='he_normal',
+        kernel_size=1,
+        padding='valid')
+    # The transformer class head predicts non-void classes (i.e., thing
+    # classes and stuff classes) and a void (i.e., ∅, no object) class.
+    num_classes = _get_transformer_class_head_num_classes(
+        max_deeplab_options.auxiliary_semantic_head.output_channels,
+        ignore_label=ignore_label)
+    self._transformer_class_head = convolutions.Conv1D(
+        output_channels=num_classes,
+        name='transformer_class_head',
+        # Use a conv bias rather than bn on this final class logit output.
+        use_bias=True,
+        use_bn=False,
+        activation=None,
+        # Follow the common ImageNet class initialization with stddev 0.01.
+        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.01),
+        kernel_size=1,
+        padding='valid')
+
+    self._pixel_space_feature_batch_norm = bn_layer(
+        axis=-1, name='pixel_space_feature_batch_norm',
+        gamma_initializer=tf.keras.initializers.Constant(1.0))
+    # Use a batch norm to avoid careful initialization of the mask outputs.
+    self._pixel_space_mask_batch_norm = bn_layer(
+        axis=-1, name='pixel_space_mask_batch_norm',
+        # Initialize the pixel space mask with a low temperature.
+        gamma_initializer=tf.keras.initializers.Constant(0.1))
+
+  def reset_pooling_layer(self):
+    """Resets the ASPP pooling layers to global average pooling."""
+    self._auxiliary_semantic_decoder.reset_pooling_layer()
+
+  def set_pool_size(self, pool_size):
+    """Sets the pooling size of the ASPP pooling layers.
+
+    Args:
+      pool_size: A tuple specifying the pooling size of the ASPP pooling
+        layers.
+ """ + self._auxiliary_semantic_decoder.set_pool_size(pool_size) + + def get_pool_size(self): + return self._auxiliary_semantic_decoder.get_pool_size() + + @property + def checkpoint_items(self): + items = { + common.CKPT_SEMANTIC_DECODER: + self._auxiliary_semantic_decoder, + common.CKPT_SEMANTIC_HEAD_WITHOUT_LAST_LAYER: + self._auxiliary_semantic_head.conv_block, + common.CKPT_SEMANTIC_LAST_LAYER: + self._auxiliary_semantic_head.final_conv, + common.CKPT_PIXEL_SPACE_HEAD: + self._pixel_space_head, + common.CKPT_TRANSFORMER_MASK_HEAD: + self._transformer_mask_head, + common.CKPT_TRANSFORMER_CLASS_HEAD: + self._transformer_class_head, + common.CKPT_PIXEL_SPACE_FEATURE_BATCH_NORM: + self._pixel_space_feature_batch_norm, + common.CKPT_PIXEL_SPACE_MASK_BATCH_NORM: + self._pixel_space_mask_batch_norm, + } + return items + + def call(self, features, training=False): + """Performs a forward pass. + + Args: + features: An input dict of tf.Tensor with shape [batch, height, width, + channels] or [batch, length, channels]. Different keys should point to + different features extracted by the encoder, e.g., low-level or + high-level features. + training: A boolean flag indicating whether training behavior should be + used (default: False). + + Returns: + A dictionary containing the auxiliary semantic segmentation logits, the + pixel space normalized feature, the pixel space mask logits, and the + mask transformer class logits. + """ + results = {} + semantic_features = features['feature_semantic'] + panoptic_features = features['feature_panoptic'] + transformer_class_feature = features['transformer_class_feature'] + transformer_mask_feature = features['transformer_mask_feature'] + + # Auxiliary semantic head. + semantic_shape = semantic_features.get_shape().as_list() + panoptic_shape = panoptic_features.get_shape().as_list() + # MaX-DeepLab always predicts panoptic feature at high resolution (e.g., + # stride 4 or stride 2), but the auxiliary semantic feature could be at low + # resolution (e.g., stride 16 or stride 32), in the absence of the stacked + # decoder (L == 0). In this case, we use an auxiliary semantic decoder on + # top of the semantic feature, in order to add the auxiliary semantic loss. + if semantic_shape[1:3] != panoptic_shape[1:3]: + semantic_features = self._auxiliary_semantic_decoder( + features, training=training) + auxiliary_semantic_results = self._auxiliary_semantic_head( + semantic_features, training=training) + results.update(auxiliary_semantic_results) + + # Pixel space head. + pixel_space_feature = self._pixel_space_head( + panoptic_features, training=training)[_PIXEL_SPACE_FEATURE_KEY] + pixel_space_feature = self._pixel_space_feature_batch_norm( + pixel_space_feature) + pixel_space_normalized_feature = tf.math.l2_normalize( + pixel_space_feature, axis=-1) + results[common.PRED_PIXEL_SPACE_NORMALIZED_FEATURE_KEY] = ( + pixel_space_normalized_feature) + + # Transformer class head. + transformer_class_logits = self._transformer_class_head( + transformer_class_feature) + # Bias towards the void class at initialization. + transformer_class_logits = add_bias_towards_void( + transformer_class_logits) + results[common.PRED_TRANSFORMER_CLASS_LOGITS_KEY] = transformer_class_logits + + # Transformer mask kernel. + transformer_mask_kernel = self._transformer_mask_head( + transformer_mask_feature) + + # Convolutional mask head. The pixel space mask logits are the matrix + # multiplication (or convolution) of the pixel space normalized feature and + # the transformer mask kernel. 
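+    # Shape sketch: pixel_space_normalized_feature is [batch, height, width,
+    # channels] and transformer_mask_kernel is [batch, num_mask_slots,
+    # channels], so the einsum below produces one mask logit map per mask
+    # slot, i.e., a [batch, height, width, num_mask_slots] tensor.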
+ pixel_space_mask_logits = tf.einsum( + 'bhwd,bid->bhwi', + pixel_space_normalized_feature, + transformer_mask_kernel) + # The above multiplication constructs a second-order operation which is + # sensitive to the feature scales and initializations. In order to avoid + # careful initialization or scaling of the layers, we apply batch norms on + # top of pixel_space_feature, transformer_mask_kernel, and the resulting + # pixel_space_mask_logits. + pixel_space_mask_logits = batch_norm_on_an_extra_axis( + pixel_space_mask_logits, self._pixel_space_mask_batch_norm) + results[common.PRED_PIXEL_SPACE_MASK_LOGITS_KEY] = ( + pixel_space_mask_logits) + + return results diff --git a/model/decoder/max_deeplab_test.py b/model/decoder/max_deeplab_test.py new file mode 100644 index 0000000000000000000000000000000000000000..b14c2b5ef8629c07b91480bcb6119829c166b584 --- /dev/null +++ b/model/decoder/max_deeplab_test.py @@ -0,0 +1,89 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for max_deeplab.""" + +import tensorflow as tf + +from deeplab2 import common +from deeplab2 import config_pb2 +from deeplab2.model.decoder import max_deeplab + + +def _create_max_deeplab_example_proto(num_non_void_classes=19): + semantic_decoder = config_pb2.DecoderOptions( + feature_key='feature_semantic', atrous_rates=[6, 12, 18]) + auxiliary_semantic_head = config_pb2.HeadOptions( + output_channels=num_non_void_classes, head_channels=256) + pixel_space_head = config_pb2.HeadOptions( + output_channels=128, head_channels=256) + max_deeplab_options = config_pb2.ModelOptions.MaXDeepLabOptions( + pixel_space_head=pixel_space_head, + auxiliary_semantic_head=auxiliary_semantic_head) + # Add features from lowest to highest. 
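+  # (In the test inputs below, 'res3' is the lower-resolution feature and
+  # 'res2' the higher-resolution one.)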
+ max_deeplab_options.auxiliary_low_level.add( + feature_key='res3', channels_project=64) + max_deeplab_options.auxiliary_low_level.add( + feature_key='res2', channels_project=32) + return config_pb2.ModelOptions( + decoder=semantic_decoder, max_deeplab=max_deeplab_options) + + +class MaXDeeplabTest(tf.test.TestCase): + + def test_max_deeplab_decoder_output_shape(self): + num_non_void_classes = 19 + num_mask_slots = 127 + model_options = _create_max_deeplab_example_proto( + num_non_void_classes=num_non_void_classes) + decoder = max_deeplab.MaXDeepLab( + max_deeplab_options=model_options.max_deeplab, + ignore_label=255, + decoder_options=model_options.decoder) + + input_dict = { + 'res2': + tf.random.uniform([2, 17, 17, 256]), + 'res3': + tf.random.uniform([2, 9, 9, 512]), + 'transformer_class_feature': + tf.random.uniform([2, num_mask_slots, 256]), + 'transformer_mask_feature': + tf.random.uniform([2, num_mask_slots, 256]), + 'feature_panoptic': + tf.random.uniform([2, 17, 17, 256]), + 'feature_semantic': + tf.random.uniform([2, 5, 5, 2048]) + } + resulting_dict = decoder(input_dict) + self.assertListEqual( + resulting_dict[common.PRED_SEMANTIC_LOGITS_KEY].shape.as_list(), + [2, 17, 17, 19]) # Stride 4 + self.assertListEqual( + resulting_dict[ + common.PRED_PIXEL_SPACE_NORMALIZED_FEATURE_KEY].shape.as_list(), + [2, 17, 17, 128]) # Stride 4 + self.assertListEqual( + resulting_dict[ + common.PRED_TRANSFORMER_CLASS_LOGITS_KEY].shape.as_list(), + # Non-void classes and a void class. + [2, num_mask_slots, num_non_void_classes + 1]) + self.assertListEqual( + resulting_dict[common.PRED_PIXEL_SPACE_MASK_LOGITS_KEY].shape.as_list(), + [2, 17, 17, num_mask_slots]) # Stride 4. + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/decoder/motion_deeplab_decoder.py b/model/decoder/motion_deeplab_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..6b84c79a56c246b41b0a01e9f3abe3ee0fdfa218 --- /dev/null +++ b/model/decoder/motion_deeplab_decoder.py @@ -0,0 +1,216 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This file contains the code for the Motion-DeepLab decoder.""" + +import tensorflow as tf + +from deeplab2 import common +from deeplab2 import config_pb2 +from deeplab2.model.decoder import panoptic_deeplab + + +class MotionDeepLabDecoder(tf.keras.layers.Layer): + """A Motion-DeepLab decoder layer. + + This layer takes low- and high-level features as input and uses a dual-ASPP + and dual-decoder structure to aggregate features for semantic and instance + segmentation. On top of the decoders, four heads are used to predict semantic + segmentation, instance center probabilities, instance center regression, and + previous frame offset regression per pixel. + """ + + def __init__( + self, + decoder_options: config_pb2.DecoderOptions, + motion_deeplab_options: config_pb2.ModelOptions.MotionDeepLabOptions, + bn_layer=tf.keras.layers.BatchNormalization): + """Initializes a Motion-DeepLab decoder. 
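+
+    The decoder follows the Panoptic-DeepLab dual-decoder design, with an
+    additional motion head that regresses every pixel to its instance center
+    in the previous frame.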
+
+    Args:
+      decoder_options: Decoder options as defined in config_pb2.DecoderOptions.
+      motion_deeplab_options: Model options as defined in
+        config_pb2.ModelOptions.MotionDeepLabOptions.
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+    """
+    super(MotionDeepLabDecoder, self).__init__(name='MotionDeepLabDecoder')
+
+    low_level_feature_keys = [
+        item.feature_key for item in motion_deeplab_options.low_level
+    ]
+    low_level_channels_project = [
+        item.channels_project for item in motion_deeplab_options.low_level
+    ]
+
+    self._semantic_decoder = panoptic_deeplab.PanopticDeepLabSingleDecoder(
+        decoder_options.feature_key,
+        low_level_feature_keys,
+        low_level_channels_project,
+        decoder_options.aspp_channels,
+        decoder_options.decoder_channels,
+        decoder_options.atrous_rates,
+        name='semantic_decoder',
+        bn_layer=bn_layer)
+    self._semantic_head = panoptic_deeplab.PanopticDeepLabSingleHead(
+        motion_deeplab_options.semantic_head.head_channels,
+        motion_deeplab_options.semantic_head.output_channels,
+        common.PRED_SEMANTIC_LOGITS_KEY,
+        name='semantic_head',
+        bn_layer=bn_layer)
+
+    self._instance_decoder = None
+    self._instance_center_head = None
+    self._instance_regression_head = None
+    self._motion_regression_head = None
+
+    if motion_deeplab_options.instance.low_level_override:
+      low_level_options = motion_deeplab_options.instance.low_level_override
+    else:
+      low_level_options = motion_deeplab_options.low_level
+
+    # If instance_decoder is set, use those options; otherwise reuse the
+    # architecture as defined for the semantic decoder.
+    if motion_deeplab_options.instance.HasField('instance_decoder_override'):
+      decoder_options = (motion_deeplab_options.instance
+                         .instance_decoder_override)
+
+    low_level_feature_keys = [item.feature_key for item in low_level_options]
+    low_level_channels_project = [
+        item.channels_project for item in low_level_options
+    ]
+
+    self._instance_decoder = panoptic_deeplab.PanopticDeepLabSingleDecoder(
+        decoder_options.feature_key,
+        low_level_feature_keys,
+        low_level_channels_project,
+        decoder_options.aspp_channels,
+        decoder_options.decoder_channels,
+        decoder_options.atrous_rates,
+        name='instance_decoder',
+        bn_layer=bn_layer)
+    self._instance_center_head = panoptic_deeplab.PanopticDeepLabSingleHead(
+        motion_deeplab_options.instance.center_head.head_channels,
+        motion_deeplab_options.instance.center_head.output_channels,
+        common.PRED_CENTER_HEATMAP_KEY,
+        name='instance_center_head',
+        bn_layer=bn_layer)
+    self._instance_regression_head = panoptic_deeplab.PanopticDeepLabSingleHead(
+        motion_deeplab_options.instance.regression_head.head_channels,
+        motion_deeplab_options.instance.regression_head.output_channels,
+        common.PRED_OFFSET_MAP_KEY,
+        name='instance_regression_head',
+        bn_layer=bn_layer)
+
+    # The motion head regresses every pixel to its center in the previous
+    # frame.
+    self._motion_regression_head = panoptic_deeplab.PanopticDeepLabSingleHead(
+        motion_deeplab_options.motion_head.head_channels,
+        motion_deeplab_options.motion_head.output_channels,
+        common.PRED_FRAME_OFFSET_MAP_KEY,
+        name='motion_regression_head',
+        bn_layer=bn_layer)
+
+  def reset_pooling_layer(self):
+    """Resets the ASPP pooling layers to global average pooling."""
+    self._semantic_decoder.reset_pooling_layer()
+    if self._instance_decoder is not None:
+      self._instance_decoder.reset_pooling_layer()
+
+  def set_pool_size(self, pool_size):
+    """Sets the pooling size of the ASPP pooling layers.
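+
+    This is used, e.g., to adapt the pooling size to a scaled input resolution
+    during multi-scale inference.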
+ + Args: + pool_size: A tuple specifying the pooling size of the ASPP pooling layers. + """ + self._semantic_decoder.set_pool_size(pool_size) + if self._instance_decoder is not None: + self._instance_decoder.set_pool_size(pool_size) + + def get_pool_size(self): + return self._semantic_decoder.get_pool_size() + + def call(self, features, training=False): + """Performs a forward pass. + + Args: + features: An input dict of tf.Tensor with shape [batch, height, width, + channels]. Different keys should point to different features extracted + by the encoder, e.g. low-level or high-level features. + training: A boolean flag indicating whether training behavior should be + used (default: False). + + Returns: + A dictionary containing the results of the semantic segmentation head and + depending on the configuration also of the instance segmentation head. + """ + + semantic_features = self._semantic_decoder(features, training=training) + results = self._semantic_head(semantic_features, training=training) + + if self._instance_decoder is not None: + instance_features = self._instance_decoder(features, training=training) + instance_center_predictions = self._instance_center_head( + instance_features, training=training) + instance_regression_predictions = self._instance_regression_head( + instance_features, training=training) + motion_regression_predictions = self._motion_regression_head( + instance_features, training=training) + if results.keys() & motion_regression_predictions.keys(): + raise ValueError('The keys of the semantic branch and the instance ' + 'motion branch overlap. Please use unique keys.') + results.update(motion_regression_predictions) + + if results.keys() & instance_center_predictions.keys(): + raise ValueError('The keys of the semantic branch and the instance ' + 'center branch overlap. Please use unique keys.') + results.update(instance_center_predictions) + + if results.keys() & instance_regression_predictions.keys(): + raise ValueError('The keys of the semantic branch and the instance ' + 'regression branch overlap. Please use unique keys.') + results.update(instance_regression_predictions) + + return results + + @property + def checkpoint_items(self): + items = { + common.CKPT_SEMANTIC_DECODER: + self._semantic_decoder, + common.CKPT_SEMANTIC_HEAD_WITHOUT_LAST_LAYER: + self._semantic_head.conv_block, + common.CKPT_SEMANTIC_LAST_LAYER: + self._semantic_head.final_conv + } + if self._instance_decoder is not None: + instance_items = { + common.CKPT_INSTANCE_DECODER: + self._instance_decoder, + common.CKPT_INSTANCE_CENTER_HEAD_WITHOUT_LAST_LAYER: + self._instance_center_head.conv_block, + common.CKPT_INSTANCE_CENTER_HEAD_LAST_LAYER: + self._instance_center_head.final_conv, + common.CKPT_INSTANCE_REGRESSION_HEAD_WITHOUT_LAST_LAYER: + self._instance_regression_head.conv_block, + common.CKPT_INSTANCE_REGRESSION_HEAD_LAST_LAYER: + self._instance_regression_head.final_conv, + common.CKPT_MOTION_REGRESSION_HEAD_WITHOUT_LAST_LAYER: + self._motion_regression_head.conv_block, + common.CKPT_MOTION_REGRESSION_HEAD_LAST_LAYER: + self._motion_regression_head.final_conv, + } + items.update(instance_items) + return items diff --git a/model/decoder/panoptic_deeplab.py b/model/decoder/panoptic_deeplab.py new file mode 100644 index 0000000000000000000000000000000000000000..4ccbeaff5f49789f93188ec03f49eeec06bbe0b2 --- /dev/null +++ b/model/decoder/panoptic_deeplab.py @@ -0,0 +1,445 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This file contains code to build a Panoptic-DeepLab decoder.
+
+Reference:
+  - [Panoptic-DeepLab: A Simple, Strong, and Fast Baseline for Bottom-Up
+      Panoptic Segmentation](https://arxiv.org/pdf/1911.10194)
+"""
+from absl import logging
+
+import tensorflow as tf
+
+from deeplab2 import common
+from deeplab2.model import utils
+from deeplab2.model.decoder import aspp
+from deeplab2.model.layers import convolutions
+
+
+layers = tf.keras.layers
+
+
+class PanopticDeepLabSingleDecoder(layers.Layer):
+  """A single Panoptic-DeepLab decoder layer.
+
+  This layer takes low- and high-level features as input and uses an ASPP
+  followed by a fusion block to decode features for a single task, e.g.,
+  semantic segmentation or instance segmentation.
+  """
+
+  def __init__(self,
+               high_level_feature_name,
+               low_level_feature_names,
+               low_level_channels_project,
+               aspp_output_channels,
+               decoder_output_channels,
+               atrous_rates,
+               name,
+               aspp_use_only_1x1_proj_conv=False,
+               decoder_conv_type='depthwise_separable_conv',
+               bn_layer=tf.keras.layers.BatchNormalization):
+    """Initializes a single Panoptic-DeepLab decoder layer.
+
+    Args:
+      high_level_feature_name: A string specifying the name of the high-level
+        feature coming from an encoder.
+      low_level_feature_names: A list of strings specifying the names of the
+        low-level features coming from an encoder. An order from highest to
+        lowest level is expected, e.g. ['res3', 'res2'].
+      low_level_channels_project: A list of integers specifying the number of
+        filters used for processing each of the low-level features.
+      aspp_output_channels: An integer specifying the number of filters in the
+        ASPP convolution layers.
+      decoder_output_channels: An integer specifying the number of filters in
+        the decoder convolution layers.
+      atrous_rates: A list of three integers specifying the atrous rate for the
+        ASPP layers.
+      name: A string specifying the name of the layer.
+      aspp_use_only_1x1_proj_conv: Boolean, specifying whether the five ASPP
+        branches are turned off. If True, the ASPP module degenerates to a
+        single 1x1 convolution, projecting the input channels to
+        `output_channels`.
+      decoder_conv_type: String, specifying the decoder convolution type.
+        Supports 'depthwise_separable_conv' and 'standard_conv'.
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+
+    Raises:
+      ValueError: An error occurs when the length of low_level_feature_names
+        differs from the length of low_level_channels_project.
+ """ + super(PanopticDeepLabSingleDecoder, self).__init__(name=name) + self._channel_axis = 3 + + self._aspp = aspp.ASPP( + aspp_output_channels, + atrous_rates, + aspp_use_only_1x1_proj_conv=aspp_use_only_1x1_proj_conv, + name='aspp', + bn_layer=bn_layer) + self._high_level_feature_name = high_level_feature_name + + if len(low_level_feature_names) != len(low_level_channels_project): + raise ValueError('The Panoptic-DeepLab decoder requires the same number ' + 'of low-level features as the number of low-level ' + 'projection channels. But got %d and %d.' + % (len(low_level_feature_names), + len(low_level_channels_project))) + + self._low_level_feature_names = low_level_feature_names + + for i, channels_project in enumerate(low_level_channels_project): + # Check if channel sizes increases and issue a warning. + if i > 0 and low_level_channels_project[i - 1] < channels_project: + logging.warning( + 'The low level projection channels usually do not ' + 'increase for features with higher spatial resolution. ' + 'Please make sure, this behavior is intended.') + current_low_level_conv_name, current_fusion_conv_name = ( + utils.get_low_level_conv_fusion_conv_current_names(i)) + utils.safe_setattr( + self, current_low_level_conv_name, convolutions.Conv2DSame( + channels_project, + kernel_size=1, + name=utils.get_layer_name(current_low_level_conv_name), + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation='relu')) + + utils.safe_setattr( + self, current_fusion_conv_name, convolutions.StackedConv2DSame( + conv_type=decoder_conv_type, + num_layers=1, + output_channels=decoder_output_channels, + kernel_size=5, + name=utils.get_layer_name(current_fusion_conv_name), + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation='relu')) + + def call(self, features, training=False): + """Performs a forward pass. + + Args: + features: An input dict of tf.Tensor with shape [batch, height, width, + channels]. Different keys should point to different features extracted + by the encoder, e.g. low-level or high-level features. + training: A boolean flag indicating whether training behavior should be + used (default: False). + + Returns: + Refined features as instance of tf.Tensor. + """ + + high_level_features = features[self._high_level_feature_name] + combined_features = self._aspp(high_level_features, training=training) + + # Fuse low-level features with high-level features. + for i in range(len(self._low_level_feature_names)): + current_low_level_conv_name, current_fusion_conv_name = ( + utils.get_low_level_conv_fusion_conv_current_names(i)) + # Iterate from the highest level of the low level features to the lowest + # level, i.e. take the features with the smallest spatial size first. 
+      low_level_features = features[self._low_level_feature_names[i]]
+      low_level_features = getattr(self, current_low_level_conv_name)(
+          low_level_features, training=training)
+
+      target_h = tf.shape(low_level_features)[1]
+      target_w = tf.shape(low_level_features)[2]
+      source_h = tf.shape(combined_features)[1]
+      source_w = tf.shape(combined_features)[2]
+
+      tf.assert_less(
+          source_h - 1,
+          target_h,
+          message='Features must not be down-sampled in the decoder.')
+      tf.assert_less(
+          source_w - 1,
+          target_w,
+          message='Features must not be down-sampled in the decoder.')
+
+      combined_features = utils.resize_align_corners(combined_features,
+                                                     [target_h, target_w])
+
+      combined_features = tf.concat([combined_features, low_level_features],
+                                    self._channel_axis)
+      combined_features = getattr(self, current_fusion_conv_name)(
+          combined_features, training=training)
+
+    return combined_features
+
+  def reset_pooling_layer(self):
+    """Resets the ASPP pooling layer to global average pooling."""
+    self._aspp.reset_pooling_layer()
+
+  def set_pool_size(self, pool_size):
+    """Sets the pooling size of the ASPP pooling layer.
+
+    Args:
+      pool_size: A tuple specifying the pooling size of the ASPP pooling layer.
+    """
+    self._aspp.set_pool_size(pool_size)
+
+  def get_pool_size(self):
+    return self._aspp.get_pool_size()
+
+
+class PanopticDeepLabSingleHead(layers.Layer):
+  """A single Panoptic-DeepLab head layer.
+
+  This layer takes in the enriched features from a decoder and adds two
+  convolutions on top.
+  """
+
+  def __init__(self,
+               intermediate_channels,
+               output_channels,
+               pred_key,
+               name,
+               conv_type='depthwise_separable_conv',
+               bn_layer=tf.keras.layers.BatchNormalization):
+    """Initializes a single Panoptic-DeepLab head.
+
+    Args:
+      intermediate_channels: An integer specifying the number of filters of the
+        first 5x5 convolution.
+      output_channels: An integer specifying the number of filters of the
+        second 1x1 convolution.
+      pred_key: A string specifying the key of the output dictionary.
+      name: A string specifying the name of this head.
+      conv_type: String, specifying the head convolution type. Supports
+        'depthwise_separable_conv' and 'standard_conv'.
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+    """
+    super(PanopticDeepLabSingleHead, self).__init__(name=name)
+    self._pred_key = pred_key
+
+    self.conv_block = convolutions.StackedConv2DSame(
+        conv_type=conv_type,
+        num_layers=1,
+        output_channels=intermediate_channels,
+        kernel_size=5,
+        name='conv_block',
+        use_bias=False,
+        use_bn=True,
+        bn_layer=bn_layer,
+        activation='relu')
+    self.final_conv = layers.Conv2D(
+        output_channels,
+        kernel_size=1,
+        name='final_conv',
+        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.01))
+
+  def call(self, features, training=False):
+    """Performs a forward pass.
+
+    Args:
+      features: A tf.Tensor with shape [batch, height, width, channels].
+      training: A boolean flag indicating whether training behavior should be
+        used (default: False).
+
+    Returns:
+      A dictionary containing the predictions under the specified key.
+    """
+    x = self.conv_block(features, training=training)
+    return {self._pred_key: self.final_conv(x)}
+
+
+class PanopticDeepLab(layers.Layer):
+  """A Panoptic-DeepLab decoder layer.
+
+  This layer takes low- and high-level features as input and uses a dual-ASPP
+  and dual-decoder structure to aggregate features for semantic and instance
+  segmentation.
On top of the decoders, three heads are used to predict semantic + segmentation, instance center probabilities, and instance center regression + per pixel. + """ + + def __init__(self, + decoder_options, + panoptic_deeplab_options, + bn_layer=tf.keras.layers.BatchNormalization): + """Initializes a Panoptic-DeepLab decoder. + + Args: + decoder_options: Decoder options as defined in config_pb2.DecoderOptions. + panoptic_deeplab_options: Model options as defined in + config_pb2.ModelOptions.PanopticDeeplabOptions. + bn_layer: An optional tf.keras.layers.Layer that computes the + normalization (default: tf.keras.layers.BatchNormalization). + """ + super(PanopticDeepLab, self).__init__(name='PanopticDeepLab') + + low_level_feature_keys = [ + item.feature_key for item in panoptic_deeplab_options.low_level + ] + low_level_channels_project = [ + item.channels_project for item in panoptic_deeplab_options.low_level + ] + + self._semantic_decoder = PanopticDeepLabSingleDecoder( + high_level_feature_name=decoder_options.feature_key, + low_level_feature_names=low_level_feature_keys, + low_level_channels_project=low_level_channels_project, + aspp_output_channels=decoder_options.aspp_channels, + decoder_output_channels=decoder_options.decoder_channels, + atrous_rates=decoder_options.atrous_rates, + name='semantic_decoder', + aspp_use_only_1x1_proj_conv=decoder_options.aspp_use_only_1x1_proj_conv, + decoder_conv_type=decoder_options.decoder_conv_type, + bn_layer=bn_layer) + self._semantic_head = PanopticDeepLabSingleHead( + panoptic_deeplab_options.semantic_head.head_channels, + panoptic_deeplab_options.semantic_head.output_channels, + common.PRED_SEMANTIC_LOGITS_KEY, + name='semantic_head', + conv_type=panoptic_deeplab_options.semantic_head.head_conv_type, + bn_layer=bn_layer) + + self._instance_decoder = None + self._instance_center_head = None + self._instance_regression_head = None + + if panoptic_deeplab_options.instance.enable: + if panoptic_deeplab_options.instance.low_level_override: + low_level_options = panoptic_deeplab_options.instance.low_level_override + else: + low_level_options = panoptic_deeplab_options.low_level + + # If instance_decoder is set, use those options; otherwise reuse the + # architecture as defined for the semantic decoder. 
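+      # Note that the local `decoder_options` is rebound here, so the instance
+      # decoder and heads below pick up the override, while the semantic
+      # decoder above keeps the original options.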
+ if panoptic_deeplab_options.instance.HasField( + 'instance_decoder_override'): + decoder_options = (panoptic_deeplab_options.instance + .instance_decoder_override) + + low_level_feature_keys = [item.feature_key for item in low_level_options] + low_level_channels_project = [ + item.channels_project for item in low_level_options + ] + + self._instance_decoder = PanopticDeepLabSingleDecoder( + high_level_feature_name=decoder_options.feature_key, + low_level_feature_names=low_level_feature_keys, + low_level_channels_project=low_level_channels_project, + aspp_output_channels=decoder_options.aspp_channels, + decoder_output_channels=decoder_options.decoder_channels, + atrous_rates=decoder_options.atrous_rates, + name='instance_decoder', + aspp_use_only_1x1_proj_conv=( + decoder_options.aspp_use_only_1x1_proj_conv), + decoder_conv_type=decoder_options.decoder_conv_type, + bn_layer=bn_layer) + self._instance_center_head = PanopticDeepLabSingleHead( + panoptic_deeplab_options.instance.center_head.head_channels, + panoptic_deeplab_options.instance.center_head.output_channels, + common.PRED_CENTER_HEATMAP_KEY, + name='instance_center_head', + conv_type=( + panoptic_deeplab_options.instance.center_head.head_conv_type), + bn_layer=bn_layer) + self._instance_regression_head = PanopticDeepLabSingleHead( + panoptic_deeplab_options.instance.regression_head.head_channels, + panoptic_deeplab_options.instance.regression_head.output_channels, + common.PRED_OFFSET_MAP_KEY, + name='instance_regression_head', + conv_type=( + panoptic_deeplab_options.instance.regression_head.head_conv_type), + bn_layer=bn_layer) + + def reset_pooling_layer(self): + """Resets the ASPP pooling layers to global average pooling.""" + self._semantic_decoder.reset_pooling_layer() + if self._instance_decoder is not None: + self._instance_decoder.reset_pooling_layer() + + def set_pool_size(self, pool_size): + """Sets the pooling size of the ASPP pooling layers. + + Args: + pool_size: A tuple specifying the pooling size of the ASPP pooling layers. + """ + self._semantic_decoder.set_pool_size(pool_size) + if self._instance_decoder is not None: + self._instance_decoder.set_pool_size(pool_size) + + def get_pool_size(self): + return self._semantic_decoder.get_pool_size() + + @property + def checkpoint_items(self): + items = { + common.CKPT_SEMANTIC_DECODER: + self._semantic_decoder, + common.CKPT_SEMANTIC_HEAD_WITHOUT_LAST_LAYER: + self._semantic_head.conv_block, + common.CKPT_SEMANTIC_LAST_LAYER: + self._semantic_head.final_conv + } + if self._instance_decoder is not None: + instance_items = { + common.CKPT_INSTANCE_DECODER: + self._instance_decoder, + common.CKPT_INSTANCE_CENTER_HEAD_WITHOUT_LAST_LAYER: + self._instance_center_head.conv_block, + common.CKPT_INSTANCE_CENTER_HEAD_LAST_LAYER: + self._instance_center_head.final_conv, + common.CKPT_INSTANCE_REGRESSION_HEAD_WITHOUT_LAST_LAYER: + self._instance_regression_head.conv_block, + common.CKPT_INSTANCE_REGRESSION_HEAD_LAST_LAYER: + self._instance_regression_head.final_conv, + } + items.update(instance_items) + return items + + def call(self, features, training=False): + """Performs a forward pass. + + Args: + features: An input dict of tf.Tensor with shape [batch, height, width, + channels]. Different keys should point to different features extracted + by the encoder, e.g. low-level or high-level features. + training: A boolean flag indicating whether training behavior should be + used (default: False). 
+ + Returns: + A dictionary containing the results of the semantic segmentation head and + depending on the configuration also of the instance segmentation head. + """ + + semantic_features = self._semantic_decoder(features, training=training) + results = self._semantic_head(semantic_features, training=training) + + if self._instance_decoder is not None: + instance_features = self._instance_decoder(features, training=training) + instance_center_predictions = self._instance_center_head( + instance_features, training=training) + instance_regression_predictions = self._instance_regression_head( + instance_features, training=training) + + if results.keys() & instance_center_predictions.keys(): + raise ValueError('The keys of the semantic branch and the instance ' + 'center branch overlap. Please use unique keys.') + results.update(instance_center_predictions) + + if results.keys() & instance_regression_predictions.keys(): + raise ValueError('The keys of the semantic branch and the instance ' + 'regression branch overlap. Please use unique keys.') + results.update(instance_regression_predictions) + + return results diff --git a/model/decoder/panoptic_deeplab_test.py b/model/decoder/panoptic_deeplab_test.py new file mode 100644 index 0000000000000000000000000000000000000000..ff5a8bf69e542371618dbcc17fe9d17c63f0e1be --- /dev/null +++ b/model/decoder/panoptic_deeplab_test.py @@ -0,0 +1,267 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for panoptic_deeplab.""" + +import tensorflow as tf + +from deeplab2 import common +from deeplab2 import config_pb2 +from deeplab2.model.decoder import panoptic_deeplab +from deeplab2.utils import test_utils + + +def _create_panoptic_deeplab_example_proto(num_classes=19): + semantic_decoder = config_pb2.DecoderOptions( + feature_key='res5', atrous_rates=[6, 12, 18]) + semantic_head = config_pb2.HeadOptions( + output_channels=num_classes, head_channels=256) + + instance_decoder = config_pb2.DecoderOptions( + feature_key='res5', decoder_channels=128, atrous_rates=[6, 12, 18]) + center_head = config_pb2.HeadOptions( + output_channels=1, head_channels=32) + regression_head = config_pb2.HeadOptions( + output_channels=2, head_channels=32) + + instance_branch = config_pb2.InstanceOptions( + instance_decoder_override=instance_decoder, + center_head=center_head, + regression_head=regression_head) + + panoptic_deeplab_options = config_pb2.ModelOptions.PanopticDeeplabOptions( + semantic_head=semantic_head, instance=instance_branch) + # Add features from lowest to highest. 
+ panoptic_deeplab_options.low_level.add( + feature_key='res3', channels_project=64) + panoptic_deeplab_options.low_level.add( + feature_key='res2', channels_project=32) + + return config_pb2.ModelOptions( + decoder=semantic_decoder, panoptic_deeplab=panoptic_deeplab_options) + + +def _create_expected_shape(input_shape, output_channels): + output_shape = input_shape.copy() + output_shape[3] = output_channels + return output_shape + + +class PanopticDeeplabTest(tf.test.TestCase): + + def test_panoptic_deeplab_single_decoder_init_errors(self): + with self.assertRaises(ValueError): + _ = panoptic_deeplab.PanopticDeepLabSingleDecoder( + high_level_feature_name='test', + low_level_feature_names=['only_one_name'], # Error: Only one name. + low_level_channels_project=[64, 32], + aspp_output_channels=256, + decoder_output_channels=256, + atrous_rates=[6, 12, 18], + name='test_decoder') + + with self.assertRaises(ValueError): + _ = panoptic_deeplab.PanopticDeepLabSingleDecoder( + high_level_feature_name='test', + low_level_feature_names=['one', 'two'], + low_level_channels_project=[64], # Error: Only one projection size. + aspp_output_channels=256, + decoder_output_channels=256, + atrous_rates=[6, 12, 18], + name='test_decoder') + + def test_panoptic_deeplab_single_decoder_call_errors(self): + decoder = panoptic_deeplab.PanopticDeepLabSingleDecoder( + high_level_feature_name='high', + low_level_feature_names=['low_one', 'low_two'], + low_level_channels_project=[64, 32], + aspp_output_channels=256, + decoder_output_channels=256, + atrous_rates=[6, 12, 18], + name='test_decoder') + + with self.assertRaises(KeyError): + input_dict = {'not_high': tf.random.uniform(shape=(2, 32, 32, 512)), + 'low_one': tf.random.uniform(shape=(2, 128, 128, 128)), + 'low_two': tf.random.uniform(shape=(2, 256, 256, 64))} + _ = decoder(input_dict) + with self.assertRaises(KeyError): + input_dict = {'high': tf.random.uniform(shape=(2, 32, 32, 512)), + 'not_low_one': tf.random.uniform(shape=(2, 128, 128, 128)), + 'low_two': tf.random.uniform(shape=(2, 256, 256, 64))} + _ = decoder(input_dict) + with self.assertRaises(KeyError): + input_dict = {'high': tf.random.uniform(shape=(2, 32, 32, 512)), + 'low_one': tf.random.uniform(shape=(2, 128, 128, 128)), + 'not_low_two': tf.random.uniform(shape=(2, 256, 256, 64))} + _ = decoder(input_dict) + + def test_panoptic_deeplab_single_decoder_reset_pooling(self): + decoder = panoptic_deeplab.PanopticDeepLabSingleDecoder( + high_level_feature_name='high', + low_level_feature_names=['low_one', 'low_two'], + low_level_channels_project=[64, 32], + aspp_output_channels=256, + decoder_output_channels=256, + atrous_rates=[6, 12, 18], + name='test_decoder') + pool_size = (None, None) + decoder.reset_pooling_layer() + + self.assertTupleEqual(decoder._aspp._aspp_pool._pool_size, + pool_size) + + def test_panoptic_deeplab_single_decoder_set_pooling(self): + decoder = panoptic_deeplab.PanopticDeepLabSingleDecoder( + high_level_feature_name='high', + low_level_feature_names=['low_one', 'low_two'], + low_level_channels_project=[64, 32], + aspp_output_channels=256, + decoder_output_channels=256, + atrous_rates=[6, 12, 18], + name='test_decoder') + + pool_size = (10, 10) + decoder.set_pool_size(pool_size) + + self.assertTupleEqual(decoder._aspp._aspp_pool._pool_size, + pool_size) + + def test_panoptic_deeplab_single_decoder_output_shape(self): + decoder_channels = 256 + decoder = panoptic_deeplab.PanopticDeepLabSingleDecoder( + high_level_feature_name='high', + low_level_feature_names=['low_one', 
'low_two'], + low_level_channels_project=[64, 32], + aspp_output_channels=256, + decoder_output_channels=decoder_channels, + atrous_rates=[6, 12, 18], + name='test_decoder') + + input_shapes_list = [[[2, 128, 128, 128], [2, 256, 256, 64], + [2, 32, 32, 512]], + [[2, 129, 129, 128], [2, 257, 257, 64], + [2, 33, 33, 512]]] + + for shapes in input_shapes_list: + input_dict = {'low_one': tf.random.uniform(shape=shapes[0]), + 'low_two': tf.random.uniform(shape=shapes[1]), + 'high': tf.random.uniform(shape=shapes[2])} + + expected_shape = _create_expected_shape(shapes[1], decoder_channels) + + resulting_tensor = decoder(input_dict) + self.assertListEqual(resulting_tensor.shape.as_list(), expected_shape) + + def test_panoptic_deeplab_single_head_output_shape(self): + output_channels = 19 + head = panoptic_deeplab.PanopticDeepLabSingleHead( + intermediate_channels=256, + output_channels=output_channels, + pred_key='pred', + name='test_head') + + input_shapes_list = [[2, 256, 256, 48], [2, 257, 257, 48]] + for shape in input_shapes_list: + input_tensor = tf.random.uniform(shape=shape) + expected_shape = _create_expected_shape(shape, output_channels) + + resulting_tensor = head(input_tensor) + self.assertListEqual(resulting_tensor['pred'].shape.as_list(), + expected_shape) + + def test_panoptic_deeplab_decoder_output_shape(self): + num_classes = 31 + model_options = _create_panoptic_deeplab_example_proto( + num_classes=num_classes) + decoder = panoptic_deeplab.PanopticDeepLab( + panoptic_deeplab_options=model_options.panoptic_deeplab, + decoder_options=model_options.decoder) + + input_shapes_list = [[[2, 256, 256, 64], [2, 128, 128, 128], + [2, 32, 32, 512]], + [[2, 257, 257, 64], [2, 129, 129, 128], + [2, 33, 33, 512]]] + + for shapes in input_shapes_list: + input_dict = {'res2': tf.random.uniform(shape=shapes[0]), + 'res3': tf.random.uniform(shape=shapes[1]), + 'res5': tf.random.uniform(shape=shapes[2])} + + expected_semantic_shape = _create_expected_shape(shapes[0], num_classes) + expected_instance_center_shape = _create_expected_shape(shapes[0], 1) + expected_instance_regression_shape = _create_expected_shape(shapes[0], 2) + + resulting_dict = decoder(input_dict) + self.assertListEqual( + resulting_dict[common.PRED_SEMANTIC_LOGITS_KEY].shape.as_list(), + expected_semantic_shape) + self.assertListEqual( + resulting_dict[common.PRED_CENTER_HEATMAP_KEY].shape.as_list(), + expected_instance_center_shape) + self.assertListEqual( + resulting_dict[common.PRED_OFFSET_MAP_KEY].shape.as_list(), + expected_instance_regression_shape) + + @test_utils.test_all_strategies + def test_panoptic_deeplab_sync_bn(self, strategy): + num_classes = 31 + model_options = _create_panoptic_deeplab_example_proto( + num_classes=num_classes) + input_dict = {'res2': tf.random.uniform(shape=[2, 257, 257, 64]), + 'res3': tf.random.uniform(shape=[2, 129, 129, 128]), + 'res5': tf.random.uniform(shape=[2, 33, 33, 512])} + + with strategy.scope(): + for bn_layer in test_utils.NORMALIZATION_LAYERS: + decoder = panoptic_deeplab.PanopticDeepLab( + panoptic_deeplab_options=model_options.panoptic_deeplab, + decoder_options=model_options.decoder, + bn_layer=bn_layer) + _ = decoder(input_dict) + + def test_panoptic_deeplab_single_decoder_logging_feature_order(self): + with self.assertLogs(level='WARN'): + _ = panoptic_deeplab.PanopticDeepLabSingleDecoder( + high_level_feature_name='high', + low_level_feature_names=['low_two', 'low_one'], + low_level_channels_project=[32, 64], # Potentially wrong order. 
+          aspp_output_channels=256,
+          decoder_output_channels=256,
+          atrous_rates=[6, 12, 18],
+          name='test_decoder')
+
+  def test_panoptic_deeplab_decoder_ckpt_items(self):
+    num_classes = 31
+    model_options = _create_panoptic_deeplab_example_proto(
+        num_classes=num_classes)
+    decoder = panoptic_deeplab.PanopticDeepLab(
+        panoptic_deeplab_options=model_options.panoptic_deeplab,
+        decoder_options=model_options.decoder)
+    ckpt_dict = decoder.checkpoint_items
+    self.assertIn(common.CKPT_SEMANTIC_DECODER, ckpt_dict)
+    self.assertIn(common.CKPT_SEMANTIC_HEAD_WITHOUT_LAST_LAYER, ckpt_dict)
+    self.assertIn(common.CKPT_SEMANTIC_LAST_LAYER, ckpt_dict)
+    self.assertIn(common.CKPT_INSTANCE_DECODER, ckpt_dict)
+    self.assertIn(common.CKPT_INSTANCE_REGRESSION_HEAD_WITHOUT_LAST_LAYER,
+                  ckpt_dict)
+    self.assertIn(common.CKPT_INSTANCE_REGRESSION_HEAD_LAST_LAYER, ckpt_dict)
+    self.assertIn(common.CKPT_INSTANCE_CENTER_HEAD_WITHOUT_LAST_LAYER,
+                  ckpt_dict)
+    self.assertIn(common.CKPT_INSTANCE_CENTER_HEAD_LAST_LAYER, ckpt_dict)
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/model/decoder/vip_deeplab_decoder.py b/model/decoder/vip_deeplab_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..a12bb1ca4e9ba75b6f4b2a275beb838f0376f863
--- /dev/null
+++ b/model/decoder/vip_deeplab_decoder.py
@@ -0,0 +1,279 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This file contains code to build a ViP-DeepLab decoder.
+
+Reference:
+  - [ViP-DeepLab: Learning Visual Perception with Depth-aware Video
+      Panoptic Segmentation](https://arxiv.org/abs/2012.05258)
+"""
+import tensorflow as tf
+
+from deeplab2 import common
+from deeplab2.model.decoder import panoptic_deeplab
+
+
+layers = tf.keras.layers
+
+
+class ViPDeepLabDecoder(layers.Layer):
+  """A ViP-DeepLab decoder layer.
+
+  This layer takes low- and high-level features as input and uses a dual-ASPP
+  and dual-decoder structure to aggregate features for semantic and instance
+  segmentation. On top of the decoders, three heads are used to predict
+  semantic segmentation, instance center probabilities, and instance center
+  regression per pixel. It also has a branch to predict the next-frame
+  instance center regression. Unlike the ViP-DeepLab paper, which uses
+  Cascade-ASPP, this reimplementation only uses ASPP.
+  """
+
+  def __init__(self,
+               decoder_options,
+               vip_deeplab_options,
+               bn_layer=tf.keras.layers.BatchNormalization):
+    """Initializes a ViP-DeepLab decoder.
+
+    Args:
+      decoder_options: Decoder options as defined in config_pb2.DecoderOptions.
+      vip_deeplab_options: Model options as defined in
+        config_pb2.ModelOptions.ViPDeeplabOptions.
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+ """ + super(ViPDeepLabDecoder, self).__init__(name='ViPDeepLab') + + low_level_feature_keys = [ + item.feature_key for item in vip_deeplab_options.low_level + ] + low_level_channels_project = [ + item.channels_project for item in vip_deeplab_options.low_level + ] + + self._semantic_decoder = panoptic_deeplab.PanopticDeepLabSingleDecoder( + high_level_feature_name=decoder_options.feature_key, + low_level_feature_names=low_level_feature_keys, + low_level_channels_project=low_level_channels_project, + aspp_output_channels=decoder_options.aspp_channels, + decoder_output_channels=decoder_options.decoder_channels, + atrous_rates=decoder_options.atrous_rates, + name='semantic_decoder', + aspp_use_only_1x1_proj_conv=decoder_options.aspp_use_only_1x1_proj_conv, + decoder_conv_type=decoder_options.decoder_conv_type, + bn_layer=bn_layer) + self._semantic_head = panoptic_deeplab.PanopticDeepLabSingleHead( + vip_deeplab_options.semantic_head.head_channels, + vip_deeplab_options.semantic_head.output_channels, + common.PRED_SEMANTIC_LOGITS_KEY, + name='semantic_head', + conv_type=vip_deeplab_options.semantic_head.head_conv_type, + bn_layer=bn_layer) + + self._instance_decoder = None + self._instance_center_head = None + self._instance_regression_head = None + self._next_instance_decoder = None + self._next_instance_regression_head = None + + if vip_deeplab_options.instance.enable: + if vip_deeplab_options.instance.low_level_override: + low_level_options = vip_deeplab_options.instance.low_level_override + else: + low_level_options = vip_deeplab_options.low_level + + # If instance_decoder is set, use those options; otherwise reuse the + # architecture as defined for the semantic decoder. + if vip_deeplab_options.instance.HasField( + 'instance_decoder_override'): + decoder_options = (vip_deeplab_options.instance + .instance_decoder_override) + + low_level_feature_keys = [item.feature_key for item in low_level_options] + low_level_channels_project = [ + item.channels_project for item in low_level_options + ] + + self._instance_decoder = panoptic_deeplab.PanopticDeepLabSingleDecoder( + high_level_feature_name=decoder_options.feature_key, + low_level_feature_names=low_level_feature_keys, + low_level_channels_project=low_level_channels_project, + aspp_output_channels=decoder_options.aspp_channels, + decoder_output_channels=decoder_options.decoder_channels, + atrous_rates=decoder_options.atrous_rates, + name='instance_decoder', + aspp_use_only_1x1_proj_conv=( + decoder_options.aspp_use_only_1x1_proj_conv), + decoder_conv_type=decoder_options.decoder_conv_type, + bn_layer=bn_layer) + self._instance_center_head = panoptic_deeplab.PanopticDeepLabSingleHead( + vip_deeplab_options.instance.center_head.head_channels, + vip_deeplab_options.instance.center_head.output_channels, + common.PRED_CENTER_HEATMAP_KEY, + name='instance_center_head', + conv_type=( + vip_deeplab_options.instance.center_head.head_conv_type), + bn_layer=bn_layer) + self._instance_regression_head = ( + panoptic_deeplab.PanopticDeepLabSingleHead( + vip_deeplab_options.instance.regression_head.head_channels, + vip_deeplab_options.instance.regression_head.output_channels, + common.PRED_OFFSET_MAP_KEY, + name='instance_regression_head', + conv_type=( + vip_deeplab_options.instance.regression_head.head_conv_type), + bn_layer=bn_layer)) + + if vip_deeplab_options.instance.HasField('next_regression_head'): + self._next_instance_decoder = ( + panoptic_deeplab.PanopticDeepLabSingleDecoder( + high_level_feature_name=decoder_options.feature_key, + 
low_level_feature_names=low_level_feature_keys, + low_level_channels_project=low_level_channels_project, + aspp_output_channels=decoder_options.aspp_channels, + decoder_output_channels=decoder_options.decoder_channels, + atrous_rates=decoder_options.atrous_rates, + name='next_instance_decoder', + aspp_use_only_1x1_proj_conv=( + decoder_options.aspp_use_only_1x1_proj_conv), + decoder_conv_type=decoder_options.decoder_conv_type, + bn_layer=bn_layer)) + self._next_instance_regression_head = ( + panoptic_deeplab.PanopticDeepLabSingleHead( + (vip_deeplab_options.instance.next_regression_head + .head_channels), + (vip_deeplab_options.instance.next_regression_head + .output_channels), + common.PRED_NEXT_OFFSET_MAP_KEY, + name='next_instance_regression_head', + conv_type=(vip_deeplab_options.instance.next_regression_head + .head_conv_type), + bn_layer=bn_layer)) + self._next_high_level_feature_name = decoder_options.feature_key + + def reset_pooling_layer(self): + """Resets the ASPP pooling layers to global average pooling.""" + self._semantic_decoder.reset_pooling_layer() + if self._instance_decoder is not None: + self._instance_decoder.reset_pooling_layer() + if self._next_instance_decoder is not None: + self._next_instance_decoder.reset_pooling_layer() + + def set_pool_size(self, pool_size): + """Sets the pooling size of the ASPP pooling layers. + + Args: + pool_size: A tuple specifying the pooling size of the ASPP pooling layers. + """ + self._semantic_decoder.set_pool_size(pool_size) + if self._instance_decoder is not None: + self._instance_decoder.set_pool_size(pool_size) + if self._next_instance_decoder is not None: + self._next_instance_decoder.set_pool_size(pool_size) + + def get_pool_size(self): + return self._semantic_decoder.get_pool_size() + + @property + def checkpoint_items(self): + items = { + common.CKPT_SEMANTIC_DECODER: + self._semantic_decoder, + common.CKPT_SEMANTIC_HEAD_WITHOUT_LAST_LAYER: + self._semantic_head.conv_block, + common.CKPT_SEMANTIC_LAST_LAYER: + self._semantic_head.final_conv + } + if self._instance_decoder is not None: + instance_items = { + common.CKPT_INSTANCE_DECODER: + self._instance_decoder, + common.CKPT_INSTANCE_CENTER_HEAD_WITHOUT_LAST_LAYER: + self._instance_center_head.conv_block, + common.CKPT_INSTANCE_CENTER_HEAD_LAST_LAYER: + self._instance_center_head.final_conv, + common.CKPT_INSTANCE_REGRESSION_HEAD_WITHOUT_LAST_LAYER: + self._instance_regression_head.conv_block, + common.CKPT_INSTANCE_REGRESSION_HEAD_LAST_LAYER: + self._instance_regression_head.final_conv, + } + items.update(instance_items) + if self._next_instance_decoder is not None: + next_instance_items = { + common.CKPT_NEXT_INSTANCE_DECODER: + self._next_instance_decoder, + common.CKPT_NEXT_INSTANCE_REGRESSION_HEAD_WITHOUT_LAST_LAYER: + self._next_instance_regression_head.conv_block, + common.CKPT_NEXT_INSTANCE_REGRESSION_HEAD_LAST_LAYER: + self._next_instance_regression_head.final_conv, + } + items.update(next_instance_items) + return items + + def call(self, features, next_features, training=False): + """Performs a forward pass. + + Args: + features: An input dict of tf.Tensor with shape [batch, height, width, + channels]. Different keys should point to different features extracted + by the encoder, e.g. low-level or high-level features. + next_features: An input dict of tf.Tensor similar to features. The + features are computed with the next frame as input. + training: A boolean flag indicating whether training behavior should be + used (default: False). 
+
+    Returns:
+      A dictionary containing the results of the semantic segmentation head and
+      depending on the configuration also of the instance segmentation head.
+    """
+
+    semantic_features = self._semantic_decoder(features, training=training)
+    results = self._semantic_head(semantic_features, training=training)
+
+    if self._instance_decoder is not None:
+      instance_features = self._instance_decoder(features, training=training)
+      instance_center_predictions = self._instance_center_head(
+          instance_features, training=training)
+      instance_regression_predictions = self._instance_regression_head(
+          instance_features, training=training)
+
+      if results.keys() & instance_center_predictions.keys():
+        raise ValueError('The keys of the semantic branch and the instance '
+                         'center branch overlap. Please use unique keys.')
+      results.update(instance_center_predictions)
+
+      if results.keys() & instance_regression_predictions.keys():
+        raise ValueError('The keys of the semantic branch and the instance '
+                         'regression branch overlap. Please use unique keys.')
+      results.update(instance_regression_predictions)
+
+    if self._next_instance_decoder is not None:
+      # We update the high-level features in next_features with the
+      # concatenation of the high-level features from both features and
+      # next_features.
+      high_level_feature_name = self._next_high_level_feature_name
+      high_level_features = features[high_level_feature_name]
+      next_high_level_features = next_features[high_level_feature_name]
+      next_high_level_features = tf.concat(
+          [high_level_features, next_high_level_features], axis=3)
+      next_features[high_level_feature_name] = next_high_level_features
+      next_regression_features = self._next_instance_decoder(
+          next_features, training=training)
+      next_regression_predictions = self._next_instance_regression_head(
+          next_regression_features, training=training)
+      if results.keys() & next_regression_predictions.keys():
+        raise ValueError('The keys of the next regression branch overlap. '
+                         'Please use unique keys.')
+      results.update(next_regression_predictions)
+
+    return results
diff --git a/model/deeplab.py b/model/deeplab.py
new file mode 100644
index 0000000000000000000000000000000000000000..617908e7469ba77e5458156aca948162b22752b2
--- /dev/null
+++ b/model/deeplab.py
@@ -0,0 +1,280 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This file contains the DeepLab meta architecture."""
+import collections
+import functools
+from typing import Any, Dict, Text, Tuple
+
+from absl import logging
+import tensorflow as tf
+
+from deeplab2 import common
+from deeplab2 import config_pb2
+from deeplab2.data import dataset
+from deeplab2.model import builder
+from deeplab2.model import utils
+from deeplab2.model.post_processor import post_processor_builder
+
+_OFFSET_OUTPUT = 'offset'
+
+
+class DeepLab(tf.keras.Model):
+  """This class represents the DeepLab meta architecture.
+ + This class supports four architectures of the DeepLab family: DeepLab V3, + DeepLab V3+, Panoptic-DeepLab, and MaX-DeepLab. The exact architecture must be + defined during initialization. + """ + + def __init__(self, + config: config_pb2.ExperimentOptions, + dataset_descriptor: dataset.DatasetDescriptor): + """Initializes a DeepLab architecture. + + Args: + config: A config_pb2.ExperimentOptions configuration. + dataset_descriptor: A dataset.DatasetDescriptor. + + Raises: + ValueError: If MaX-DeepLab is used with multi-scale inference. + """ + super(DeepLab, self).__init__(name='DeepLab') + + if config.trainer_options.solver_options.use_sync_batchnorm: + logging.info('Synchronized Batchnorm is used.') + bn_layer = functools.partial( + tf.keras.layers.experimental.SyncBatchNormalization, + momentum=config.trainer_options.solver_options.batchnorm_momentum, + epsilon=config.trainer_options.solver_options.batchnorm_epsilon) + else: + logging.info('Standard (unsynchronized) Batchnorm is used.') + bn_layer = functools.partial( + tf.keras.layers.BatchNormalization, + momentum=config.trainer_options.solver_options.batchnorm_momentum, + epsilon=config.trainer_options.solver_options.batchnorm_epsilon) + + # Divide weight decay by 2 to match the implementation of tf.nn.l2_loss. In + # this way, we allow our users to use a normal weight decay (e.g., 1e-4 for + # ResNet variants) in the config textproto. Then, we pass the adjusted + # weight decay (e.g., 5e-5 for ResNets) to keras in order to exactly match + # the commonly used tf.nn.l2_loss in TF1. References: + # https://github.com/tensorflow/models/blob/68ee72ae785274156b9e943df4145b257cd78b32/official/vision/beta/tasks/image_classification.py#L41 + # https://www.tensorflow.org/api_docs/python/tf/keras/regularizers/l2 + # https://www.tensorflow.org/api_docs/python/tf/nn/l2_loss + self._encoder = builder.create_encoder( + config.model_options.backbone, bn_layer, + conv_kernel_weight_decay=( + config.trainer_options.solver_options.weight_decay / 2)) + + self._decoder = builder.create_decoder( + config.model_options, bn_layer, dataset_descriptor.ignore_label) + + self._is_max_deeplab = ( + config.model_options.WhichOneof('meta_architecture') == 'max_deeplab') + self._post_processor = post_processor_builder.get_post_processor( + config, dataset_descriptor) + + # The ASPP pooling size is always set to train crop size, which is found to + # be experimentally better. + pool_size = config.train_dataset_options.crop_size + output_stride = float(config.model_options.backbone.output_stride) + pool_size = tuple( + utils.scale_mutable_sequence(pool_size, 1.0 / output_stride)) + logging.info('Setting pooling size to %s', pool_size) + self.set_pool_size(pool_size) + + # Variables for multi-scale inference. + self._add_flipped_images = config.evaluator_options.add_flipped_images + if not config.evaluator_options.eval_scales: + self._eval_scales = [1.0] + else: + self._eval_scales = config.evaluator_options.eval_scales + if self._is_max_deeplab and ( + self._add_flipped_images or len(self._eval_scales) > 1): + raise ValueError( + 'MaX-DeepLab does not support multi-scale inference yet.') + + def call(self, + input_tensor: tf.Tensor, + training: bool = False) -> Dict[Text, Any]: + """Performs a forward pass. + + Args: + input_tensor: An input tensor of type tf.Tensor with shape [batch, height, + width, channels]. The input tensor should contain batches of RGB images. 
+      training: A boolean flag indicating whether training behavior should be
+        used (default: False).
+
+    Returns:
+      A dictionary containing the results of the specified DeepLab
+      architecture. The results are bilinearly upsampled to the input size
+      before returning.
+    """
+    # Normalize the input in the same way as Inception. We normalize it outside
+    # the encoder so that we can extend encoders to different backbones without
+    # copying the normalization to each encoder. We normalize it after data
+    # preprocessing because it is faster on TPUs than on host CPUs. The
+    # normalization should not increase TPU memory consumption because it does
+    # not require gradient.
+    input_tensor = input_tensor / 127.5 - 1.0
+    # Get the static spatial shape of the input tensor.
+    _, input_h, input_w, _ = input_tensor.get_shape().as_list()
+    if training:
+      result_dict = self._decoder(
+          self._encoder(input_tensor, training=training), training=training)
+      result_dict = self._resize_predictions(
+          result_dict,
+          target_h=input_h,
+          target_w=input_w)
+    else:
+      result_dict = collections.defaultdict(list)
+      # Evaluation mode where one could perform multi-scale inference.
+      scale_1_pool_size = self.get_pool_size()
+      logging.info('Eval with scales %s', self._eval_scales)
+      for eval_scale in self._eval_scales:
+        # Get the scaled images/pool_size for each scale.
+        scaled_images, scaled_pool_size = (
+            self._scale_images_and_pool_size(
+                input_tensor, list(scale_1_pool_size), eval_scale))
+        # Update the ASPP pool size for different eval scales.
+        self.set_pool_size(tuple(scaled_pool_size))
+        logging.info('Eval scale %s; setting pooling size to %s',
+                     eval_scale, scaled_pool_size)
+        pred_dict = self._decoder(
+            self._encoder(scaled_images, training=training), training=training)
+        # MaX-DeepLab skips this resizing and upsamples the mask outputs in
+        # self._post_processor.
+        pred_dict = self._resize_predictions(
+            pred_dict,
+            target_h=input_h,
+            target_w=input_w)
+        # Change the semantic logits to probabilities with softmax. Note that
+        # one could remove the semantic logits for faster inference. We still
+        # keep them since they will be used to compute the evaluation loss.
+        pred_dict[common.PRED_SEMANTIC_PROBS_KEY] = tf.nn.softmax(
+            pred_dict[common.PRED_SEMANTIC_LOGITS_KEY])
+        # Store the predictions from each scale.
+        for output_type, output_value in pred_dict.items():
+          result_dict[output_type].append(output_value)
+        if self._add_flipped_images:
+          pred_dict_reverse = self._decoder(
+              self._encoder(tf.reverse(scaled_images, [2]), training=training),
+              training=training)
+          pred_dict_reverse = self._resize_predictions(
+              pred_dict_reverse,
+              target_h=input_h,
+              target_w=input_w,
+              reverse=True)
+          # Change the semantic logits to probabilities with softmax.
+          pred_dict_reverse[common.PRED_SEMANTIC_PROBS_KEY] = tf.nn.softmax(
+              pred_dict_reverse[common.PRED_SEMANTIC_LOGITS_KEY])
+          # Store the predictions from each scale.
+          for output_type, output_value in pred_dict_reverse.items():
+            result_dict[output_type].append(output_value)
+      # Set back the pool_size for scale 1.0, the original setting.
+      self.set_pool_size(tuple(scale_1_pool_size))
+      # Average results across scales.
+      for output_type, output_value in result_dict.items():
+        result_dict[output_type] = tf.reduce_mean(
+            tf.stack(output_value, axis=0), axis=0)
+      # Post-process the results.
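+      # The post-processor consumes the merged multi-scale predictions and
+      # adds its own outputs (e.g., the final panoptic prediction) to the
+      # result dict.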
+      result_dict.update(self._post_processor(result_dict))
+
+    if common.PRED_CENTER_HEATMAP_KEY in result_dict:
+      result_dict[common.PRED_CENTER_HEATMAP_KEY] = tf.squeeze(
+          result_dict[common.PRED_CENTER_HEATMAP_KEY], axis=3)
+    return result_dict
+
+  def reset_pooling_layer(self):
+    """Resets the ASPP pooling layer to global average pooling."""
+    self._decoder.reset_pooling_layer()
+
+  def set_pool_size(self, pool_size: Tuple[int, int]):
+    """Sets the pooling size of the ASPP pooling layer.
+
+    Args:
+      pool_size: A tuple specifying the pooling size of the ASPP pooling layer.
+    """
+    self._decoder.set_pool_size(pool_size)
+
+  def get_pool_size(self):
+    return self._decoder.get_pool_size()
+
+  @property
+  def checkpoint_items(self) -> Dict[Text, Any]:
+    items = dict(encoder=self._encoder)
+    items.update(self._decoder.checkpoint_items)
+    return items
+
+  def _resize_predictions(self, result_dict, target_h, target_w,
+                          reverse=False):
+    """Resizes predictions to the target height and width.
+
+    This function resizes the items in the result_dict to the target height and
+    width. The items are optionally reversed w.r.t. width if `reverse` is True.
+
+    Args:
+      result_dict: A dictionary storing prediction results to be resized.
+      target_h: An integer, the target height.
+      target_w: An integer, the target width.
+      reverse: A boolean, reversing the prediction result w.r.t. width.
+
+    Returns:
+      Resized (or optionally reversed) result_dict.
+    """
+    # By default, MaX-DeepLab does not upsample any output during training in
+    # order to save GPU/TPU memory, but upsampling might lead to better
+    # performance.
+    if self._is_max_deeplab:
+      return result_dict
+    for key, value in result_dict.items():
+      if reverse:
+        value = tf.reverse(value, [2])
+        # Special care to offsets: need to flip x-offsets.
+        if _OFFSET_OUTPUT in key:
+          offset_y, offset_x = tf.split(
+              value=value, num_or_size_splits=2, axis=3)
+          offset_x *= -1
+          value = tf.concat([offset_y, offset_x], 3)
+      if _OFFSET_OUTPUT in key:
+        result_dict[key] = utils.resize_and_rescale_offsets(
+            value, [target_h, target_w])
+      else:
+        result_dict[key] = utils.resize_bilinear(
+            value, [target_h, target_w])
+    return result_dict
+
+  def _scale_images_and_pool_size(self, images, pool_size, scale):
+    """Scales images and pool_size w.r.t. scale.
+
+    Args:
+      images: An input tensor with shape [batch, height, width, 3].
+      pool_size: A list with two elements, specifying the pooling size
+        of ASPP pooling layer.
+      scale: A float, used to scale the input images and pool_size.
+
+    Returns:
+      Scaled images, and pool_size.
+    """
+    if scale == 1.0:
+      scaled_images = images
+      scaled_pool_size = pool_size
+    else:
+      image_size = images.get_shape().as_list()[1:3]
+      scaled_image_size = utils.scale_mutable_sequence(image_size, scale)
+      scaled_images = utils.resize_bilinear(images, scaled_image_size)
+      scaled_pool_size = [None, None]
+      if pool_size != [None, None]:
+        scaled_pool_size = utils.scale_mutable_sequence(pool_size, scale)
+    return scaled_images, scaled_pool_size
diff --git a/model/deeplab_test.py b/model/deeplab_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab90af61157ae23f92890d49f1490e1bb1cd7a30
--- /dev/null
+++ b/model/deeplab_test.py
@@ -0,0 +1,252 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for deeplab.""" + +import os + +import numpy as np +import tensorflow as tf + +from google.protobuf import text_format +from deeplab2 import common +from deeplab2 import config_pb2 +from deeplab2.data import dataset +from deeplab2.model import deeplab +from deeplab2.model import utils +# resources dependency + +_CONFIG_PATH = 'deeplab2/configs/example' + + +def _read_proto_file(filename, proto): + filename = filename # OSS: removed internal filename loading. + with tf.io.gfile.GFile(filename, 'r') as proto_file: + return text_format.ParseLines(proto_file, proto) + + +def _create_model_from_test_proto(file_name, + dataset_name='cityscapes_panoptic'): + proto_filename = os.path.join(_CONFIG_PATH, file_name) + config = _read_proto_file(proto_filename, config_pb2.ExperimentOptions()) + return deeplab.DeepLab(config, + dataset.MAP_NAME_TO_DATASET_INFO[dataset_name] + ), config + + +class DeeplabTest(tf.test.TestCase): + + def test_deeplab_with_deeplabv3(self): + model, experiment_options = _create_model_from_test_proto( + 'example_cityscapes_deeplabv3.textproto') + train_crop_size = tuple( + experiment_options.train_dataset_options.crop_size) + input_tensor = tf.random.uniform( + shape=(2, train_crop_size[0], train_crop_size[1], 3)) + expected_semantic_shape = [ + 2, train_crop_size[0], train_crop_size[1], + experiment_options.model_options.deeplab_v3.num_classes] + resulting_dict = model(input_tensor) + self.assertListEqual( + resulting_dict[common.PRED_SEMANTIC_LOGITS_KEY].shape.as_list(), + expected_semantic_shape) + num_params = np.sum( + [np.prod(v.get_shape().as_list()) for v in model.trainable_weights]) + self.assertEqual(num_params, 39638355) + + def test_deeplab_with_deeplabv3plus(self): + model, experiment_options = _create_model_from_test_proto( + 'example_cityscapes_deeplabv3plus.textproto') + train_crop_size = tuple( + experiment_options.train_dataset_options.crop_size) + input_tensor = tf.random.uniform( + shape=(2, train_crop_size[0], train_crop_size[1], 3)) + expected_semantic_shape = [ + 2, train_crop_size[0], train_crop_size[1], + experiment_options.model_options.deeplab_v3_plus.num_classes] + resulting_dict = model(input_tensor) + self.assertListEqual( + resulting_dict[common.PRED_SEMANTIC_LOGITS_KEY].shape.as_list(), + expected_semantic_shape) + num_params = np.sum( + [np.prod(v.get_shape().as_list()) for v in model.trainable_weights]) + self.assertEqual(num_params, 39210947) + + def test_deeplab_with_deeplabv3_mv3l(self): + model, experiment_options = _create_model_from_test_proto( + 'example_cityscapes_deeplabv3_mv3l.textproto') + train_crop_size = tuple( + experiment_options.train_dataset_options.crop_size) + input_tensor = tf.random.uniform( + shape=(2, train_crop_size[0], train_crop_size[1], 3)) + expected_semantic_shape = [ + 2, train_crop_size[0], train_crop_size[1], + experiment_options.model_options.deeplab_v3.num_classes] + resulting_dict = model(input_tensor) + self.assertListEqual( + resulting_dict[common.PRED_SEMANTIC_LOGITS_KEY].shape.as_list(), + expected_semantic_shape) + num_params = np.sum( + [np.prod(v.get_shape().as_list()) for v in 
model.trainable_weights]) + self.assertEqual(num_params, 11024963) + + def test_deeplab_with_panoptic_deeplab(self): + model, experiment_options = _create_model_from_test_proto( + 'example_cityscapes_panoptic_deeplab.textproto') + train_crop_size = tuple( + experiment_options.train_dataset_options.crop_size) + input_tensor = tf.random.uniform( + shape=(2, train_crop_size[0], train_crop_size[1], 3)) + expected_semantic_shape = [ + 2, train_crop_size[0], train_crop_size[1], + experiment_options.model_options.panoptic_deeplab.semantic_head. + output_channels] + expected_instance_center_shape = [ + 2, train_crop_size[0], train_crop_size[1]] + expected_instance_regression_shape = [ + 2, train_crop_size[0], train_crop_size[1], 2] + resulting_dict = model(input_tensor) + self.assertListEqual( + resulting_dict[common.PRED_SEMANTIC_LOGITS_KEY].shape.as_list(), + expected_semantic_shape) + self.assertListEqual( + resulting_dict[common.PRED_INSTANCE_SCORES_KEY].shape.as_list(), + expected_instance_center_shape) + self.assertListEqual( + resulting_dict[common.PRED_CENTER_HEATMAP_KEY].shape.as_list(), + expected_instance_center_shape) + self.assertListEqual( + resulting_dict[common.PRED_OFFSET_MAP_KEY].shape.as_list(), + expected_instance_regression_shape) + num_params = np.sum( + [np.prod(v.get_shape().as_list()) for v in model.trainable_weights]) + self.assertEqual(num_params, 54973702) + + def test_deeplab_with_panoptic_deeplab_mv3l(self): + model, experiment_options = _create_model_from_test_proto( + 'example_cityscapes_panoptic_deeplab_mv3l.textproto') + train_crop_size = tuple( + experiment_options.train_dataset_options.crop_size) + input_tensor = tf.random.uniform( + shape=(2, train_crop_size[0], train_crop_size[1], 3)) + expected_semantic_shape = [ + 2, train_crop_size[0], train_crop_size[1], + experiment_options.model_options.panoptic_deeplab.semantic_head. + output_channels] + expected_instance_center_shape = [ + 2, train_crop_size[0], train_crop_size[1]] + expected_instance_regression_shape = [ + 2, train_crop_size[0], train_crop_size[1], 2] + resulting_dict = model(input_tensor) + self.assertListEqual( + resulting_dict[common.PRED_SEMANTIC_LOGITS_KEY].shape.as_list(), + expected_semantic_shape) + self.assertListEqual( + resulting_dict[common.PRED_INSTANCE_SCORES_KEY].shape.as_list(), + expected_instance_center_shape) + self.assertListEqual( + resulting_dict[common.PRED_CENTER_HEATMAP_KEY].shape.as_list(), + expected_instance_center_shape) + self.assertListEqual( + resulting_dict[common.PRED_OFFSET_MAP_KEY].shape.as_list(), + expected_instance_regression_shape) + num_params = np.sum( + [np.prod(v.get_shape().as_list()) for v in model.trainable_weights]) + self.assertEqual(num_params, 18226550) + + def test_deeplab_with_max_deeplab(self): + model, experiment_options = _create_model_from_test_proto( + 'example_coco_max_deeplab.textproto', dataset_name='coco_panoptic') + train_crop_size = tuple( + experiment_options.train_dataset_options.crop_size) + input_tensor = tf.random.uniform( + shape=(2, train_crop_size[0], train_crop_size[1], 3)) + stride_4_size = utils.scale_mutable_sequence(train_crop_size, 0.25) + expected_semantic_shape = [ + 2, stride_4_size[0], stride_4_size[1], experiment_options.model_options. + max_deeplab.auxiliary_semantic_head.output_channels] + expected_transformer_class_logits_shape = [ + 2, 128, experiment_options.model_options. 
+ max_deeplab.auxiliary_semantic_head.output_channels] + expected_pixel_space_normalized_feature_shape = [ + 2, stride_4_size[0], stride_4_size[1], experiment_options.model_options. + max_deeplab.pixel_space_head.output_channels] + expected_pixel_space_mask_logits_shape = [ + 2, stride_4_size[0], stride_4_size[1], 128] + resulting_dict = model(input_tensor, training=True) + self.assertListEqual( + resulting_dict[common.PRED_SEMANTIC_LOGITS_KEY].shape.as_list(), + expected_semantic_shape) + self.assertListEqual( + resulting_dict[ + common.PRED_TRANSFORMER_CLASS_LOGITS_KEY].shape.as_list(), + expected_transformer_class_logits_shape) + self.assertListEqual( + resulting_dict[ + common.PRED_PIXEL_SPACE_NORMALIZED_FEATURE_KEY].shape.as_list(), + expected_pixel_space_normalized_feature_shape) + self.assertListEqual( + resulting_dict[common.PRED_PIXEL_SPACE_MASK_LOGITS_KEY].shape.as_list(), + expected_pixel_space_mask_logits_shape) + num_params = 0 + for v in model.trainable_weights: + params = np.prod(v.get_shape().as_list()) + # Exclude the auxiliary semantic head. + if 'auxiliary_semantic' not in v.name: + num_params += params + self.assertEqual(num_params, 61900200) # 61.9M in the paper. + + def test_deeplab_errors(self): + proto_filename = os.path.join( + _CONFIG_PATH, 'example_cityscapes_panoptic_deeplab.textproto') + experiment_options = _read_proto_file(proto_filename, + config_pb2.ExperimentOptions()) + + with self.subTest('ResNet error.'): + with self.assertRaises(ValueError): + experiment_options.model_options.backbone.name = 'not_a_resnet_backbone' + _ = deeplab.DeepLab(experiment_options, + dataset.CITYSCAPES_PANOPTIC_INFORMATION) + + with self.subTest('Encoder family error.'): + with self.assertRaises(ValueError): + experiment_options.model_options.backbone.name = 'not_a_backbone' + _ = deeplab.DeepLab(experiment_options, + dataset.CITYSCAPES_PANOPTIC_INFORMATION) + + def test_deeplab_set_pooling(self): + model, _ = _create_model_from_test_proto( + 'example_cityscapes_panoptic_deeplab.textproto') + pool_size = (10, 10) + model.set_pool_size(pool_size) + + self.assertTupleEqual( + model._decoder._semantic_decoder._aspp._aspp_pool._pool_size, pool_size) + self.assertTupleEqual( + model._decoder._instance_decoder._aspp._aspp_pool._pool_size, pool_size) + + def test_deeplab_reset_pooling(self): + model, _ = _create_model_from_test_proto( + 'example_cityscapes_panoptic_deeplab.textproto') + model.reset_pooling_layer() + pool_size = (None, None) + self.assertTupleEqual( + model._decoder._semantic_decoder._aspp._aspp_pool._pool_size, pool_size) + self.assertTupleEqual( + model._decoder._instance_decoder._aspp._aspp_pool._pool_size, pool_size) + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/encoder/__init__.py b/model/encoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..35e4ce02ff422f3aa84ab644b88d65b13e0cbc03 --- /dev/null +++ b/model/encoder/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/model/encoder/axial_resnet.py b/model/encoder/axial_resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..5e54ec52c73a4ed32f882b44717a163800938787 --- /dev/null +++ b/model/encoder/axial_resnet.py @@ -0,0 +1,776 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Implements Axial-ResNets proposed in Axial-DeepLab [1]. + +[1] Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, + ECCV 2020 Spotlight. + Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, + Liang-Chieh Chen. +""" + +import tensorflow as tf + +from deeplab2.model import utils +from deeplab2.model.layers import activations +from deeplab2.model.layers import axial_block_groups +from deeplab2.model.layers import convolutions +from deeplab2.model.layers import resized_fuse +from deeplab2.model.layers import stems + +# Add a suffix in layer names that indicate if the current layer is a part of +# the backbone or an extra layer, i.e. if the current layer will be pretrained +# or not. This name will be used when we apply 10x larger learning rates for +# extra parameters that have not been pretrained, in panoptic segmentation. +# This keyword is reserved and should not be a part of the variable names in a +# classification pretrained backbone. +EXTRA = 'extra' +# Similarly, we will apply 10x larger learning rates on the memory feature. +# This global variable name will be accessed when we build the optimizers. This +# keyword is reserved and should not be a part of the variable names in a +# classification pretrained backbone. +MEMORY_FEATURE = 'memory_feature' + + +class AxialResNet(tf.keras.Model): + """An Axial-ResNet model as proposed in Axial-DeepLab [1] and MaX-DeepLab [2]. + + An Axial-ResNet [1] replaces 3x3 convolutions in a Resnet by axial-attention + layers. A dual-path transformer [2] and a stacked decoder [2] can be used + optionally. In addition, this class supports scaling models with SWideRNet [3] + and augmenting convolutions with Switchable Atrous Convolution [4]. + + Reference: + [1] Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, + ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853 + Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, + Liang-Chieh Chen. + [2] MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers", + CVPR 2021. https://arxiv.org/abs/2012.00759 + Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen. + [3] Scaling Wide Residual Networks for Panoptic Segmentation, + https://arxiv.org/abs/2011.11675 + Liang-Chieh Chen, Huiyu Wang, Siyuan Qiao. + [4] DetectoRS: Detecting Objects with Recursive Feature Pyramid and Switchable + Atrous Convolution, CVPR 2021. https://arxiv.org/abs/2006.02334 + Siyuan Qiao, Liang-Chieh Chen, Alan Yuille. 
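+
+  A minimal usage sketch (illustrative only; the constructor defaults
+  correspond to MaX-DeepLab-S for panoptic segmentation):
+
+    model = AxialResNet(name='max_deeplab_s')
+    endpoints = model(tf.zeros([2, 65, 65, 3]), training=False)
+    panoptic_feature = endpoints['feature_panoptic']
+    semantic_feature = endpoints['feature_semantic']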
+ """ + + def __init__(self, + name, + num_blocks=(3, 4, 6, 3), + backbone_layer_multiplier=1.0, + width_multiplier=1.0, + stem_width_multiplier=1.0, + output_stride=16, + classification_mode=False, + backbone_type='resnet_beta', + use_axial_beyond_stride=16, + backbone_use_transformer_beyond_stride=32, + extra_decoder_use_transformer_beyond_stride=32, + backbone_decoder_num_stacks=0, + backbone_decoder_blocks_per_stage=1, + extra_decoder_num_stacks=0, + extra_decoder_blocks_per_stage=1, + max_num_mask_slots=128, + num_mask_slots=128, + memory_channels=256, + base_transformer_expansion=1.0, + global_feed_forward_network_channels=256, + high_resolution_output_stride=4, + activation='relu', + block_group_config=None, + bn_layer=tf.keras.layers.BatchNormalization, + conv_kernel_weight_decay=0.0): + """Initializes an AxialResNet model. + + Args: + name: A string, the name of the model. + num_blocks: A list of 4 integers. It denotes the number of blocks to + include in the last 4 stages or block groups. Each group consists of + blocks that output features of the same resolution. Defaults to (3, 4, + 6, 3) as in MaX-DeepLab-S. + backbone_layer_multiplier: A float, layer_multiplier for the backbone, + excluding the STEM. This flag controls the number of layers. Defaults to + 1.0 as in MaX-DeepLab-S. + width_multiplier: A float, the channel multiplier for the block groups. + Defaults to 1.0 as in MaX-DeepLab-S. + stem_width_multiplier: A float, the channel multiplier for stem + convolutions. Defaults to 1.0 as in MaX-DeepLab-S. + output_stride: An integer, the maximum ratio of input to output spatial + resolution. Defaults to 16 as in MaX-DeepLab-S. + classification_mode: A boolean, whether to perform in a classification + mode. If it is True, this function directly returns backbone feature + endpoints. Note that these feature endpoints can also be used directly + for Panoptic-DeepLab or Motion-DeepLab. If it is False, this function + builds MaX-DeepLab extra decoder layers and extra transformer layers. + Defaults to False as in MaX-DeepLab. + backbone_type: A string, the type of backbone. Supports 'resnet', + 'resnet_beta', and 'wider_resnet'. It controls both the stem type and + the residual block type. Defaults to 'resnet_beta' as in MaX-DeepLab-S. + use_axial_beyond_stride: An integer, the stride beyond which we use axial + attention. Set to 0 if no axial attention is desired. Defaults to 16 as + in MaX-DeepLab. + backbone_use_transformer_beyond_stride: An integer, the stride beyond + which we use a memory path transformer block on top of a regular pixel + path block, in the backbone. Set to 0 if no transformer block is desired + in the backbone. Defaults to 32 as in MaX-DeepLab-S. + extra_decoder_use_transformer_beyond_stride: An integer, the stride beyond + which we use a memory path transformer block on top of a regular pixel + path block, in the extra decoder stages. Set to 0 if no transformer + block is desired in the extra decoder stages. Defaults to 32 as in + MaX-DeepLab-S. + backbone_decoder_num_stacks: An integer, the number of decoder stacks + (introduced in MaX-DeepLab) that we use in the backbone. The stacked + decoders are applied in a stacked hour-glass style. Defaults to 0 as in + MaX-DeepLab-S. + backbone_decoder_blocks_per_stage: An integer, the number of consecutive + residual blocks to apply for each decoder stage, in the backbone. + Defaults to 1 as in MaX-DeepLab-S. 
+      extra_decoder_num_stacks: An integer, the number of decoder stacks
+        (introduced in MaX-DeepLab) that we use in the extra decoder layers. It
+        is different from backbone_decoder_num_stacks in that the extra
+        decoder stacks will be trained from scratch on segmentation tasks,
+        instead of pretrained on ImageNet classification. Defaults to 0 as in
+        MaX-DeepLab-S.
+      extra_decoder_blocks_per_stage: An integer, the number of consecutive
+        residual blocks to apply for each decoder stage, in the extra decoder
+        stages. Defaults to 1 as in MaX-DeepLab-S.
+      max_num_mask_slots: An integer, the maximum possible number of mask slots
+        that will be used. This will be used in a pretraining-finetuning use
+        case with different num_mask_slots: We can set max_num_mask_slots to
+        the maximum possible num_mask_slots, and then the saved checkpoint can
+        be loaded for finetuning with a different num_mask_slots. Defaults to
+        128 as in MaX-DeepLab.
+      num_mask_slots: An integer, the number of mask slots that will be used.
+        Defaults to 128 as in MaX-DeepLab-S.
+      memory_channels: An integer, the number of channels for the whole memory
+        path. Defaults to 256 as in MaX-DeepLab-S.
+      base_transformer_expansion: A float, the base width expansion rate for
+        transformer layers. Defaults to 1.0 as in MaX-DeepLab-S.
+      global_feed_forward_network_channels: An integer, the number of channels
+        in the final global feed forward network, i.e. the mask feature head
+        and the mask class head. Defaults to 256 as in MaX-DeepLab-S.
+      high_resolution_output_stride: An integer, the final decoding output
+        stride. Defaults to 4 as in MaX-DeepLab-S.
+      activation: A string, the type of activation function to apply. Supports
+        'relu', 'swish' (or 'silu'), 'gelu', 'approximated_gelu', and 'elu'.
+      block_group_config: An argument dictionary that will be passed to
+        block_group.
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+      conv_kernel_weight_decay: A float, the weight decay for convolution
+        kernels.
+
+    Raises:
+      ValueError: If backbone_type is not one of 'resnet', 'resnet_beta', or
+        'wider_resnet'.
+      ValueError: If extra_decoder_blocks_per_stage is not greater than zero.
+    """
+    super(AxialResNet, self).__init__(name=name)
+
+    if extra_decoder_blocks_per_stage <= 0:
+      raise ValueError(
+          'extra_decoder_blocks_per_stage should be greater than zero.')
+    if block_group_config is None:
+      block_group_config = {}
+
+    # Compute parameter lists for block_groups. We consider five stages so that
+    # it is general enough to cover fully axial resnets and wider resnets.
+    total_strides_list = [1, 2, 4, 8, 16]
+
+    # Append 3 blocks for the first stage of fully axial resnets and wider
+    # resnets.
+    num_blocks_list = [3] + utils.scale_int_list(list(num_blocks),
+                                                 backbone_layer_multiplier)
+    strides_list = [2] * 5
+
+    # Expand the transformer and the block filters with the stride.
+    transformer_expansions_list = []
+    filters_list = []
+    for index, stride in enumerate(total_strides_list):
+      # Reduce the number of channels when we apply transformer to low level
+      # features (stride = 2, 4, or 8). The base_transformer_expansion is used
+      # for stride = 16, i.e. the standard output_stride for MaX-DeepLab-S.
+      transformer_expansions_list.append(base_transformer_expansion * stride /
+                                         16.0)
+      # Compute the base number of filters in each stage.
For example, the last + # stage of ResNet50 has an input stride of 16, then we compute the base + # number of filters for a bottleneck block as 16 * 32 = 512, which is the + # number of filters for the 3x3 convolution in those blocks. + if backbone_type == 'wider_resnet' and index == 0: + # SWideRNet variants use stem_width_multiplier for the first block. + filters_list.append(int(round(stride * 32 * stem_width_multiplier))) + else: + filters_list.append(int(round(stride * 32 * width_multiplier))) + + self._num_mask_slots = None + # Initialize memory_feature only when a transformer block is used. + self._use_memory_feature = (backbone_use_transformer_beyond_stride or + (extra_decoder_use_transformer_beyond_stride and + (not classification_mode))) + if self._use_memory_feature: + self._memory_feature_shape = (1, max_num_mask_slots, memory_channels) + self._memory_feature_initializer = ( + tf.keras.initializers.TruncatedNormal(stddev=1.0)) + self._memory_feature_regularizer = tf.keras.regularizers.l2( + conv_kernel_weight_decay) + if num_mask_slots: + self._num_mask_slots = num_mask_slots + + # Use a convolutional stem except fully axial cases. + stem_channels = int(round(64 * stem_width_multiplier)) + self._activation_fn = activations.get_activation(activation) + if use_axial_beyond_stride == 1: + self._stem = tf.identity + first_block_index = 0 + elif backbone_type.lower() == 'wider_resnet': + self._stem = convolutions.Conv2DSame( + output_channels=stem_channels, + kernel_size=3, + name='stem', + strides=2, + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation='none', + conv_kernel_weight_decay=conv_kernel_weight_decay) + # Wider ResNet has five residual block stages, so we start from index 0. + first_block_index = 0 + # Since we have applied the first strided convolution here, we do not use + # a stride for the first stage (which will operate on stride 2). + strides_list[0] = 1 + total_strides_list[0] = 2 + elif backbone_type.lower() == 'resnet_beta': + self._stem = stems.InceptionSTEM( + bn_layer=bn_layer, + width_multiplier=stem_width_multiplier, + conv_kernel_weight_decay=conv_kernel_weight_decay, + activation=activation) + first_block_index = 1 + elif backbone_type.lower() == 'resnet': + self._stem = convolutions.Conv2DSame( + output_channels=stem_channels, + kernel_size=7, + name='stem', + strides=2, + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation='none', + conv_kernel_weight_decay=conv_kernel_weight_decay) + first_block_index = 1 + else: + raise ValueError(backbone_type + ' is not supported.') + + self._first_block_index = first_block_index + # Apply standard ResNet block groups. We use first_block_index to + # distinguish models with 4 stages and those with 5 stages. 
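+    # (first_block_index is 0 for fully axial and 'wider_resnet' variants,
+    # which run five block groups, and 1 for the 'resnet' and 'resnet_beta'
+    # stems, which run the standard four.)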
+ for index in range(first_block_index, 5): + current_name = '_stage{}'.format(index + 1) + utils.safe_setattr(self, current_name, axial_block_groups.BlockGroup( + filters=filters_list[index], + num_blocks=num_blocks_list[index], + name=utils.get_layer_name(current_name), + original_resnet_stride=strides_list[index], + original_resnet_input_stride=total_strides_list[index], + output_stride=output_stride, + backbone_type=backbone_type, + use_axial_beyond_stride=use_axial_beyond_stride, + use_transformer_beyond_stride=( + backbone_use_transformer_beyond_stride), + transformer_expansion=transformer_expansions_list[index], + activation=activation, + bn_layer=bn_layer, + conv_kernel_weight_decay=conv_kernel_weight_decay, + **block_group_config)) + self._backbone_decoder_num_stacks = backbone_decoder_num_stacks + self._classification_mode = classification_mode + self._extra_decoder_num_stacks = extra_decoder_num_stacks + self._output_stride = output_stride + self._high_resolution_output_stride = high_resolution_output_stride + self._width_multiplier = width_multiplier + self._activation = activation + self._bn_layer = bn_layer + self._conv_kernel_weight_decay = conv_kernel_weight_decay + self._backbone_use_transformer_beyond_stride = ( + backbone_use_transformer_beyond_stride) + self._extra_decoder_use_transformer_beyond_stride = ( + extra_decoder_use_transformer_beyond_stride) + + # Keep track of the current stack so that we know when to stop. + current_stack = 0 + # Track whether we are building the backbone. This will affect the backbone + # related arguments, local learning rate, and so on. + current_is_backbone = True + + if backbone_decoder_num_stacks == 0: + # No stacked decoder is used in the backbone, so we have finished building + # the backbone. We either return the classification endpoints, or continue + # building a non-backbone decoder for panoptic segmentation. + if self._classification_mode: + return + else: + current_is_backbone = False + if not current_is_backbone: + # Now that we have finished building the backbone and no stacked decoder + # is used in the backbone, so we start to build extra (i.e., non-backbone) + # layers for panoptic segmentation. + current_name = '_stage5_' + EXTRA + utils.safe_setattr( + self, current_name, axial_block_groups.BlockGroup( + filters=filters_list[-1], + num_blocks=extra_decoder_blocks_per_stage, + name=utils.get_layer_name(current_name), + original_resnet_stride=1, + original_resnet_input_stride=32, + output_stride=output_stride, + backbone_type=backbone_type, + use_axial_beyond_stride=use_axial_beyond_stride, + use_transformer_beyond_stride=( + extra_decoder_use_transformer_beyond_stride), + transformer_expansion=base_transformer_expansion, + activation=activation, + bn_layer=bn_layer, + conv_kernel_weight_decay=conv_kernel_weight_decay, + **block_group_config)) + + # Compute parameter lists for stacked decoder. + total_decoder_num_stacks = ( + backbone_decoder_num_stacks + extra_decoder_num_stacks) + + # Use a function to compute the next stride. + next_stride_fn = lambda x: x // 2 + current_decoder_stride = output_stride + decoder_stage = 0 + + # Exit if we have enough stacks and reach the decoding output stride. 
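+    # For example, with output_stride = 16, high_resolution_output_stride = 4,
+    # and one decoder stack in total, current_decoder_stride visits
+    # 16 -> 8 -> 4 -> 8 -> 16 -> 8 -> 4, i.e. one hour-glass stack followed by
+    # the final decoding back to stride 4.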
+ while (current_stack < total_decoder_num_stacks or + current_decoder_stride > high_resolution_output_stride): + decoder_stage += 1 + current_decoder_stride = next_stride_fn(current_decoder_stride) + + if current_decoder_stride == output_stride: + current_stack += 1 + # Always use blocks from the last resnet stage if the current stride is + # output stride (the largest stride). + original_resnet_input_stride = 32 + + # Switch the decoder direction if we reach the largest stride. + next_stride_fn = lambda x: x // 2 + else: + original_resnet_input_stride = current_decoder_stride + + # Scale channels according to the strides. + decoder_channels = original_resnet_input_stride * 64 * width_multiplier + current_transformer_expansion = ( + base_transformer_expansion * current_decoder_stride / 16.0) + + # Apply a decoder block group for building the backbone. + if current_is_backbone: + current_name = '_decoder_stage{}'.format(decoder_stage) + utils.safe_setattr( + self, current_name, axial_block_groups.BlockGroup( + filters=decoder_channels // 4, + num_blocks=backbone_decoder_blocks_per_stage, + name=utils.get_layer_name(current_name), + original_resnet_stride=1, + original_resnet_input_stride=original_resnet_input_stride, + output_stride=output_stride, + backbone_type=backbone_type, + use_axial_beyond_stride=use_axial_beyond_stride, + use_transformer_beyond_stride=( + backbone_use_transformer_beyond_stride), + transformer_expansion=current_transformer_expansion, + activation=activation, + bn_layer=bn_layer, + conv_kernel_weight_decay=conv_kernel_weight_decay, + **block_group_config)) + + if (current_decoder_stride == output_stride and + current_stack == backbone_decoder_num_stacks): + # Now that we have finished building the backbone, we either return the + # classification endpoints, or continue building a non-backbone decoder + # for panoptic segmentation. + if classification_mode: + return + else: + current_is_backbone = False + + # Apply a decoder block group for building the extra layers. + if not current_is_backbone: + # Continue building an extra (i.e., non-backbone) decoder for panoptic + # segmentation. + current_name = '_decoder_stage{}_{}'.format(decoder_stage, EXTRA) + utils.safe_setattr( + self, current_name, axial_block_groups.BlockGroup( + filters=decoder_channels // 4, + num_blocks=extra_decoder_blocks_per_stage, + name=utils.get_layer_name(current_name), + original_resnet_stride=1, + original_resnet_input_stride=original_resnet_input_stride, + output_stride=output_stride, + backbone_type=backbone_type, + use_axial_beyond_stride=use_axial_beyond_stride, + use_transformer_beyond_stride=( + extra_decoder_use_transformer_beyond_stride), + transformer_expansion=current_transformer_expansion, + activation=activation, + bn_layer=bn_layer, + conv_kernel_weight_decay=conv_kernel_weight_decay, + **block_group_config)) + if current_decoder_stride == high_resolution_output_stride: + next_stride_fn = lambda x: x * 2 + + # Assert that we have already returned if we are building a classifier. + assert not classification_mode + if (backbone_use_transformer_beyond_stride or + extra_decoder_use_transformer_beyond_stride): + # Build extra memory path feed forward networks for the class feature and + # the mask feature. 
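+      # (These two Conv1D feed forward networks map the final memory feature
+      # to the 'transformer_class_feature' and 'transformer_mask_feature'
+      # endpoints returned by call_extra_endpoints.)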
+ current_name = '_class_feature_' + EXTRA + utils.safe_setattr( + self, current_name, convolutions.Conv1D( + global_feed_forward_network_channels, + utils.get_layer_name(current_name), + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation=activation, + conv_kernel_weight_decay=conv_kernel_weight_decay)) + current_name = '_mask_feature_' + EXTRA + utils.safe_setattr( + self, current_name, convolutions.Conv1D( + global_feed_forward_network_channels, + utils.get_layer_name(current_name), + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation=activation, + conv_kernel_weight_decay=conv_kernel_weight_decay)) + + def build(self, input_shape): + """Builds model weights and input shape dependent sub-layers.""" + if self._use_memory_feature: + self._memory_feature = self.add_weight( + name=MEMORY_FEATURE, + shape=self._memory_feature_shape, + initializer=self._memory_feature_initializer, + regularizer=self._memory_feature_regularizer) + else: + self._memory_feature = None + + # Go through the loop to build the ResizedFuse layers. + current_stack = 0 + # Track whether we are building the backbone. This will affect the backbone + # related arguments, local learning rate, and so on. + current_is_backbone = self._backbone_decoder_num_stacks != 0 + total_decoder_num_stacks = ( + self._backbone_decoder_num_stacks + self._extra_decoder_num_stacks) + next_stride_fn = lambda x: x // 2 + current_decoder_stride = self._output_stride + decoder_stage = 0 + while (current_stack < total_decoder_num_stacks or + current_decoder_stride > self._high_resolution_output_stride): + decoder_stage += 1 + current_decoder_stride = next_stride_fn(current_decoder_stride) + if current_decoder_stride == self._output_stride: + current_stack += 1 + original_resnet_input_stride = 32 + next_stride_fn = lambda x: x // 2 + else: + original_resnet_input_stride = current_decoder_stride + # Compute the decoder_channels according to original_resnet_input_stride. + # For example, at stride 4 with width multiplier = 1, we use 4 * 64 = 256 + # channels, which is the same as a standard ResNet. + decoder_channels = int(round( + original_resnet_input_stride * 64 * self._width_multiplier)) + decoder_height, decoder_width = utils.scale_mutable_sequence( + input_shape[1:3], 1.0 / current_decoder_stride) + if current_is_backbone: + current_name = '_decoder_stage{}_resized_fuse'.format(decoder_stage) + else: + current_name = '_decoder_stage{}_{}_resized_fuse'.format( + decoder_stage, EXTRA) + utils.safe_setattr( + self, current_name, resized_fuse.ResizedFuse( + name=utils.get_layer_name(current_name), + height=decoder_height, + width=decoder_width, + num_channels=decoder_channels, + activation=self._activation, + bn_layer=self._bn_layer, + conv_kernel_weight_decay=self._conv_kernel_weight_decay)) + if (current_decoder_stride == self._output_stride and + current_stack == self._backbone_decoder_num_stacks): + # Now that we have finished building the backbone, we either return the + # classification endpoints, or continue building a non-backbone decoder + # for panoptic segmentation. + if self._classification_mode: + return + current_is_backbone = False + if current_decoder_stride == self._high_resolution_output_stride: + next_stride_fn = lambda x: x * 2 + + def call_encoder_before_stacked_decoder(self, inputs, training=False): + """Performs a forward pass of the encoder before stacking decoders. + + Args: + inputs: An input [batch, height, width, channel] tensor. + training: A boolean, whether the model is in training mode. 
+ + Returns: + current_output: An output tensor with shape [batch, new_height, new_width, + new_channel]. + activated_output: An activated output tensor with shape [batch, + new_height, new_width, new_channel]. + memory_feature: None if no transformer is used. A [batch, num_memory, + memory_channel] tensor if transformer is used. + endpoints: A dict, the network endpoints that might be used by DeepLab. + """ + memory_feature = self._memory_feature + if self._use_memory_feature: + if self._num_mask_slots: + memory_feature = self._memory_feature[:, :self._num_mask_slots, :] + memory_feature = tf.tile(memory_feature, + [tf.shape(inputs)[0], 1, 1]) + + endpoints = {} + output = self._stem(inputs) + activated_output = self._activation_fn(output) + endpoints['stage1'] = output + endpoints['res1'] = activated_output + + # Apply standard ResNet block groups. We use first_block_index to + # distinguish models with 4 stages and those with 5 stages. + for index in range(self._first_block_index, 5): + current_name = '_stage{}'.format(index + 1) + current_output, activated_output, memory_feature = ( + getattr(self, current_name)( + (activated_output, memory_feature), training=training)) + endpoints[utils.get_layer_name(current_name)] = current_output + activated_output_name = 'res{}'.format(index + 1) + endpoints[activated_output_name] = activated_output + return current_output, activated_output, memory_feature, endpoints + + def call_stacked_decoder(self, + current_output, + activated_output, + memory_feature, + endpoints, + training=False): + """Performs a forward pass of the stacked decoders. + + Args: + current_output: An output tensor with shape [batch, new_height, new_width, + new_channel]. + activated_output: An activated output tensor with shape [batch, + new_height, new_width, new_channel]. + memory_feature: None if no transformer is used. A [batch, num_memory, + memory_channel] tensor if transformer is used. + endpoints: A dict, the network endpoints that might be used by DeepLab. + training: A boolean, whether the model is in training mode. + + Returns: + memory_feature: None if no transformer is used. A [batch, num_memory, + memory_channel] tensor if transformer is used. + high_resolution_outputs: A list of decoded tensors with + high_resolution_output_stride. + backbone_output: An output tensor of the backbone, with output_stride. + endpoints: A dict, the network endpoints that might be used by DeepLab. + """ + # Keep track of the current stack so that we know when to stop. + current_stack = 0 + # Track whether we are building the backbone. This will affect the backbone + # related arguments, local learning rate, and so on. + current_is_backbone = True + high_resolution_outputs = [] + + if self._backbone_decoder_num_stacks == 0: + # Keep track of the backbone output, since it might be used as the + # semantic feature output. + backbone_output = activated_output + # Now that we have finished building the backbone, we either return the + # classification logits, or continue building a non-backbone decoder for + # panoptic segmentation. + if self._classification_mode: + endpoints['backbone_output'] = backbone_output + return None, None, None, endpoints + else: + current_is_backbone = False + + if not current_is_backbone: + # Build extra layers if we have finished building the backbone. 
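+      # (This applies the '_stage5_extra' block group created in __init__. The
+      # EXTRA suffix marks layers that are not ImageNet pretrained and thus
+      # receive 10x larger learning rates, as noted at the top of this file.)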
+ current_name = '_stage5_' + EXTRA + current_output, activated_output, memory_feature = ( + getattr(self, current_name)( + (activated_output, memory_feature), training=training)) + + # Compute parameter lists for stacked decoder. + total_decoder_num_stacks = ( + self._backbone_decoder_num_stacks + self._extra_decoder_num_stacks) + + # Keep track of all endpoints that will be used in the stacked decoder. + stride_to_features = {} + stride_to_features[min(2, self._output_stride)] = [endpoints['stage1']] + stride_to_features[min(4, self._output_stride)] = [endpoints['stage2']] + stride_to_features[min(8, self._output_stride)] = [endpoints['stage3']] + stride_to_features[min(16, self._output_stride)] = [endpoints['stage4']] + # Only keep the last endpoint from the backbone with the same resolution, + # i.e., if the output stride is 16, the current output will override + # the stride 16 endpoint, endpoints['res4']. + stride_to_features[min(32, self._output_stride)] = [current_output] + + # Use a function to compute the next stride. + next_stride_fn = lambda x: x // 2 + current_decoder_stride = self._output_stride + decoder_stage = 0 + + # Exit if we have enough stacks and reach the decoding output stride. + while (current_stack < total_decoder_num_stacks or + current_decoder_stride > self._high_resolution_output_stride): + decoder_stage += 1 + current_decoder_stride = next_stride_fn(current_decoder_stride) + + if current_decoder_stride == self._output_stride: + current_stack += 1 + # Switch the decoder direction if we reach the largest stride. + next_stride_fn = lambda x: x // 2 + + # Include the current feature and two previous features from the target + # resolution in the decoder. We select two because it contains one upward + # feature and one downward feature, but better choices are possible. + decoder_features_list = ( + [current_output] + + stride_to_features[current_decoder_stride][-2:]) + + # Fuse and resize features with striding, resizing and 1x1 convolutions. + if current_is_backbone: + current_name = '_decoder_stage{}_resized_fuse'.format(decoder_stage) + else: + current_name = '_decoder_stage{}_{}_resized_fuse'.format( + decoder_stage, EXTRA) + activated_output = getattr(self, current_name)( + decoder_features_list, training=training) + + # Apply a decoder block group for building the backbone. + if current_is_backbone: + current_name = '_decoder_stage{}'.format(decoder_stage) + current_output, activated_output, memory_feature = ( + getattr(self, current_name)( + (activated_output, memory_feature), training=training)) + + if (current_decoder_stride == self._output_stride and + current_stack == self._backbone_decoder_num_stacks): + # Keep track of the backbone output, since it might be used as the + # semantic feature output. + backbone_output = activated_output + # Now that we have finished building the backbone, we either return the + # classification logits, or continue building a non-backbone decoder for + # panoptic segmentation. + if self._classification_mode: + endpoints['backbone_output'] = backbone_output + return None, None, None, endpoints + else: + current_is_backbone = False + + # Apply a decoder block group for building the extra layers. + if not current_is_backbone: + current_name = '_decoder_stage{}_{}'.format(decoder_stage, EXTRA) + current_output, activated_output, memory_feature = ( + getattr(self, current_name)( + (activated_output, memory_feature), training=training)) + + # Append the current feature into the feature dict for possible later + # usage. 
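+      # (Later decoder stages read back the last two features stored at each
+      # stride when assembling decoder_features_list above.)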
+ stride_to_features[current_decoder_stride].append(current_output) + if current_decoder_stride == self._high_resolution_output_stride: + high_resolution_outputs.append(activated_output) + next_stride_fn = lambda x: x * 2 + return memory_feature, high_resolution_outputs, backbone_output, endpoints + + def call_extra_endpoints(self, + memory_feature, + high_resolution_outputs, + backbone_output, + endpoints, + training=False): + """Performs a forward pass to generate extra endpoints. + + Args: + memory_feature: None if no transformer is used. A [batch, num_memory, + memory_channel] tensor if transformer is used. + high_resolution_outputs: A list of decoded tensors with + high_resolution_output_stride. + backbone_output: An output tensor of the backbone, with output_stride. + endpoints: A dict, the network endpoints that might be used by DeepLab. + training: A boolean, whether the model is in training mode. + + Returns: + endpoints: A dict, the network endpoints that might be used by DeepLab. + """ + # Assert that we have already returned if we are building a classifier. + assert not self._classification_mode + if (self._backbone_use_transformer_beyond_stride or + self._extra_decoder_use_transformer_beyond_stride): + # Build extra memory path feed forward networks for the class feature and + # the mask feature. + class_feature = getattr(self, '_class_feature_' + EXTRA)( + memory_feature, training=training) + mask_feature = getattr(self, '_mask_feature_' + EXTRA)( + memory_feature, training=training) + endpoints['transformer_class_feature'] = class_feature + endpoints['transformer_mask_feature'] = mask_feature + + # Output the last high resolution feature as panoptic feature. + endpoints['feature_panoptic'] = high_resolution_outputs[-1] + + # Avoid sharing our panoptic feature with the semantic auxiliary loss. So we + # use the backbone feature or the decoded backbone feature for the semantic + # segmentation head (i.e. the auxiliary loss). + if self._extra_decoder_num_stacks: + endpoints['feature_semantic'] = ( + high_resolution_outputs[self._backbone_decoder_num_stacks]) + else: + endpoints['feature_semantic'] = backbone_output + endpoints['backbone_output'] = backbone_output + return endpoints + + def call(self, inputs, training=False): + """Performs a forward pass. + + Args: + inputs: An input [batch, height, width, channel] tensor. + training: A boolean, whether the model is in training mode. + + Returns: + endpoints: A dict, the network endpoints that might be used by DeepLab. + """ + current_output, activated_output, memory_feature, endpoints = ( + self.call_encoder_before_stacked_decoder(inputs, training=training)) + memory_feature, high_resolution_outputs, backbone_output, endpoints = ( + self.call_stacked_decoder(current_output, + activated_output, + memory_feature, + endpoints, + training=training)) + if self._classification_mode: + return endpoints + endpoints = self.call_extra_endpoints(memory_feature, + high_resolution_outputs, + backbone_output, + endpoints, + training=training) + return endpoints diff --git a/model/encoder/axial_resnet_instances.py b/model/encoder/axial_resnet_instances.py new file mode 100644 index 0000000000000000000000000000000000000000..a110c11cd9a97aec27be98b85b5136af291004ef --- /dev/null +++ b/model/encoder/axial_resnet_instances.py @@ -0,0 +1,493 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Contains Axial-ResNet model instances for Axial-DeepLab and MaX-DeepLab. + +Reference: + Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, + ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853 + Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, + Liang-Chieh Chen. + MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers", + CVPR 2021. https://arxiv.org/abs/2012.00759 + Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen. +""" + +import abc +import collections.abc +import copy + +from absl import logging +import tensorflow as tf + +from deeplab2.model.encoder import axial_resnet + + +def _get_default_config(): + """Gets the default config for Axial-ResNets.""" + # The default config dictionary for an Axial-ResNet is the MaX-DeepLab-S + # architecture for panoptic segmentation. This default config dictionary also + # exactly matches the default arguments of the functions. + default_config = { + 'num_blocks': [3, 4, 6, 3], + 'backbone_layer_multiplier': 1.0, + 'width_multiplier': 1.0, + 'stem_width_multiplier': 1.0, + 'output_stride': 16, + 'classification_mode': False, + 'backbone_type': 'resnet_beta', + 'use_axial_beyond_stride': 16, + 'backbone_use_transformer_beyond_stride': 32, + 'extra_decoder_use_transformer_beyond_stride': 32, + 'backbone_decoder_num_stacks': 0, + 'backbone_decoder_blocks_per_stage': 1, + 'extra_decoder_num_stacks': 0, + 'extra_decoder_blocks_per_stage': 1, + 'max_num_mask_slots': 128, + 'num_mask_slots': 128, + 'memory_channels': 256, + 'base_transformer_expansion': 1.0, + 'global_feed_forward_network_channels': 256, + 'high_resolution_output_stride': 4, + 'activation': 'relu', + 'block_group_config': { + 'attention_bottleneck_expansion': 2, + 'drop_path_keep_prob': 0.8, + 'drop_path_beyond_stride': 16, + 'drop_path_schedule': 'constant', + 'positional_encoding_type': None, + 'use_global_beyond_stride': 0, + 'use_sac_beyond_stride': 0, + 'use_squeeze_and_excite': False, + 'conv_use_recompute_grad': False, + 'axial_use_recompute_grad': True, + 'recompute_within_stride': 0, + 'transformer_use_recompute_grad': False, + 'axial_layer_config': { + 'query_shape': (129, 129), + 'key_expansion': 1, + 'value_expansion': 2, + 'memory_flange': (32, 32), + 'double_global_attention': False, + 'num_heads': 8, + 'use_query_rpe_similarity': True, + 'use_key_rpe_similarity': True, + 'use_content_similarity': True, + 'retrieve_value_rpe': True, + 'retrieve_value_content': True, + 'initialization_std_for_query_key_rpe': 1.0, + 'initialization_std_for_value_rpe': 1.0, + 'self_attention_activation': 'softmax', + }, + 'dual_path_transformer_layer_config': { + 'num_heads': 8, + 'bottleneck_expansion': 2, + 'key_expansion': 1, + 'value_expansion': 2, + 'feed_forward_network_channels': 2048, + 'use_memory_self_attention': True, + 'use_pixel2memory_feedback_attention': True, + 'transformer_activation': 'softmax', + }, + }, + 'bn_layer': tf.keras.layers.BatchNormalization, + 'conv_kernel_weight_decay': 0.0, + } + return default_config + + +def override(config_dict, override_dict): + """Recursively overrides a 
config dict with another.""" + output_dict = copy.deepcopy(config_dict) + for key, value in override_dict.items(): + if isinstance(value, collections.abc.Mapping): + output_dict[key] = override(config_dict.get(key, {}), value) + else: + output_dict[key] = value + return output_dict + + +class AxialResNetInstance(axial_resnet.AxialResNet): + """A base Axial-ResNet model.""" + + @classmethod + @abc.abstractmethod + def _get_config(cls): + pass + + def __init__(self, name, **kwargs): + """Builds an Axial-ResNet model.""" + # Get the config of the current model. + current_config = self._get_config() + + # Override the default config with the current config. This line can be + # omitted because the default config equals the default arguments of the + # functions that build the model. But we make all the configs explicit here. + current_config = override(_get_default_config(), current_config) + + # Finally, override the current model config with keyword arguments. In this + # way, we still respect arguments passed as keyword arguments, such as + # classification_mode, output_stride, etc. + current_config = override(current_config, kwargs) + logging.info('Axial-ResNet final config: %s', current_config) + super(AxialResNetInstance, self).__init__(name, **current_config) + + +class MaXDeepLabS(AxialResNetInstance): + """MaX-DeepLab-S for panoptic segmentation. + + Reference: + Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, + ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853 + Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, + Liang-Chieh Chen. + MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers", + CVPR 2021. https://arxiv.org/abs/2012.00759 + Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen. + """ + + @classmethod + def _get_config(cls): + # Return an empty dictionary as the default values are all set for + # MaX-DeepLab-S. + return {} + + +class MaXDeepLabL(AxialResNetInstance): + """MaX-DeepLab-L for panoptic segmentation. + + Reference: + Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, + ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853 + Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, + Liang-Chieh Chen. + MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers", + CVPR 2021. https://arxiv.org/abs/2012.00759 + Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen. + """ + + @classmethod + def _get_config(cls): + return { + 'num_blocks': [3, 6, 3, 3], + 'backbone_type': 'wider_resnet', + 'backbone_use_transformer_beyond_stride': 16, + 'extra_decoder_use_transformer_beyond_stride': 16, + 'backbone_decoder_num_stacks': 1, + 'extra_decoder_num_stacks': 1, + 'extra_decoder_blocks_per_stage': 3, + 'memory_channels': 512, + 'base_transformer_expansion': 2.0, + 'global_feed_forward_network_channels': 512, + 'block_group_config': { + 'attention_bottleneck_expansion': 4, + 'drop_path_beyond_stride': 4, + 'axial_layer_config': { + 'key_expansion': 2, + 'value_expansion': 4, + }, + }, + } + + +class MaXDeepLabSBackbone(MaXDeepLabS): + """MaX-DeepLab-S backbone for image classification pretraining. + + Reference: + Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, + ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853 + Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, + Liang-Chieh Chen. + MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers", + CVPR 2021. 
https://arxiv.org/abs/2012.00759 + Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen. + """ + + @classmethod + def _get_config(cls): + base_config = super(MaXDeepLabSBackbone, cls)._get_config() + # Override the config of MaXDeepLabS. + override_config = { + 'classification_mode': True, + # The transformer blocks are not ImageNet pretrained. They are randomly + # initialized and trained from scratch for panoptic segmentation. + 'backbone_use_transformer_beyond_stride': 0, + } + return override(base_config, override_config) + + +class MaXDeepLabLBackbone(MaXDeepLabL): + """MaX-DeepLab-L backbone for image classification pretraining. + + Reference: + Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, + ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853 + Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, + Liang-Chieh Chen. + MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers", + CVPR 2021. https://arxiv.org/abs/2012.00759 + Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen. + """ + + @classmethod + def _get_config(cls): + base_config = super(MaXDeepLabLBackbone, cls)._get_config() + # Override the config of MaXDeepLabL. + override_config = { + 'classification_mode': True, + # The transformer blocks are not ImageNet pretrained. They are randomly + # initialized and trained from scratch for panoptic segmentation. + 'backbone_use_transformer_beyond_stride': 0, + } + return override(base_config, override_config) + + +class ResNet50(AxialResNetInstance): + """A ResNet-50 instance. + + Note that the implementation is different from the original ResNet-50 in: + (1) We apply strided convolutions in the first 3x3 convolution of the first + residual block of a stage. + (2) We replace the strided max pooling layer in the stem by applying strided + convolution in the immediate next residual block. + """ + + @classmethod + def _get_config(cls): + return { + 'classification_mode': True, + 'backbone_type': 'resnet', + 'use_axial_beyond_stride': 0, + 'backbone_use_transformer_beyond_stride': 0, + 'block_group_config': { + 'drop_path_keep_prob': 1.0, + }, + } + + +class ResNet50Beta(ResNet50): + """A ResNet-50 but with inception stem. + + Note that the implementation is different from the original ResNet-50 in: + (1) We apply strided convolutions in the first 3x3 convolution of the first + residual block of a stage. + (2) We replace the strided max pooling layer in the stem by applying strided + convolution in the immediate next residual block. + """ + + @classmethod + def _get_config(cls): + base_config = super(ResNet50Beta, cls)._get_config() + # Override the config of ResNet50. + override_config = { + 'backbone_type': 'resnet_beta', + } + return override(base_config, override_config) + + +class AxialResNetL(ResNet50): + """Axial-ResNet-L for image classification only. + + Axial-ResNet-L is a ResNet50 with use_axial_beyond_stride = 2. + + Reference: + Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, + ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853 + Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, + Liang-Chieh Chen. + """ + + @classmethod + def _get_config(cls): + base_config = super(AxialResNetL, cls)._get_config() + # Override the config of ResNet50. + override_config = { + 'use_axial_beyond_stride': 2, + } + return override(base_config, override_config) + + +class AxialResNetS(ResNet50): + """Axial-ResNet-S for image classification only. 
+ + Axial-ResNet-S is a ResNet50 with use_axial_beyond_stride = 2 and + width_multiplier = 0.5. + + Reference: + Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, + ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853 + Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, + Liang-Chieh Chen. + """ + + @classmethod + def _get_config(cls): + base_config = super(AxialResNetS, cls)._get_config() + # Override the config of ResNet50. + override_config = { + 'width_multiplier': 0.5, + 'use_axial_beyond_stride': 2, + } + return override(base_config, override_config) + + +class AxialDeepLabL(ResNet50Beta): + """Axial-DeepLab-L for panoptic segmentation. + + Axial-DeepLab-L is a ResNet50Beta with use_axial_beyond_stride = 2. + Axial-DeepLab-L is also equivalent to Axial-ResNet-L with an inception stem. + + Reference: + Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, + ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853 + Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, + Liang-Chieh Chen. + """ + + @classmethod + def _get_config(cls): + base_config = super(AxialDeepLabL, cls)._get_config() + override_config = { + 'use_axial_beyond_stride': 2, + } + return override(base_config, override_config) + + +class AxialDeepLabS(ResNet50Beta): + """Axial-DeepLab-S for panoptic segmentation. + + Axial-DeepLab-S is a ResNet50Beta with use_axial_beyond_stride = 2 and + width_multiplier = 0.5. + Axial-DeepLab-S is also equivalent to Axial-ResNet-S with an inception stem. + + Reference: + Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, + ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853 + Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, + Liang-Chieh Chen. + """ + + @classmethod + def _get_config(cls): + base_config = super(AxialDeepLabS, cls)._get_config() + override_config = { + 'width_multiplier': 0.5, + 'use_axial_beyond_stride': 2, + } + return override(base_config, override_config) + + +class SWideRNet(AxialResNetInstance): + """A SWideRNet instance. + + Note that the implementation is different from the original SWideRNet in: + (1) We apply strided convolutions in the first residual block of a stage, + instead of the last residual block. + (2) We replace the strided max pooling layer in the stem by applying strided + convolution in the immediate next residual block. + (3) We (optionally) use squeeze and excitation in all five stages, instead + of the last four stages only. + + Reference: + Scaling Wide Residual Networks for Panoptic Segmentation, + https://arxiv.org/abs/2011.11675 + Liang-Chieh Chen, Huiyu Wang, Siyuan Qiao. + Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, + ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853 + Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, + Liang-Chieh Chen. + """ + + @classmethod + def _get_config(cls): + return { + 'num_blocks': [3, 6, 3, 3], + 'classification_mode': True, + 'backbone_type': 'wider_resnet', + 'use_axial_beyond_stride': 0, + 'backbone_use_transformer_beyond_stride': 0, + 'block_group_config': { + 'drop_path_beyond_stride': 4, + 'conv_use_recompute_grad': True, + }, + } + + +class AxialSWideRNet(SWideRNet): + """SWideRNet with axial attention blocks in the last two stages. + + Note that the implementation is different from the original SWideRNet in: + (1) We apply strided convolutions in the first residual block of a stage, + instead of the last residual block. 
+ (2) We replace the strided max pooling layer in the stem by applying strided + convolution in the immediate next residual block. + (3) We (optionally) use squeeze and excitation in all five stages, instead + of the last four stages only. + + Reference: + Scaling Wide Residual Networks for Panoptic Segmentation, + https://arxiv.org/abs/2011.11675 + Liang-Chieh Chen, Huiyu Wang, Siyuan Qiao. + Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, + ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853 + Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, + Liang-Chieh Chen. + """ + + @classmethod + def _get_config(cls): + base_config = super(AxialSWideRNet, cls)._get_config() + override_config = { + 'use_axial_beyond_stride': 16, + 'block_group_config': { + 'attention_bottleneck_expansion': 4, + 'axial_layer_config': { + 'key_expansion': 2, + 'value_expansion': 4, + }, + }, + } + return override(base_config, override_config) + + +def get_model(name, **kwargs): + """Gets the model instance given the model name.""" + name_lower = name.lower() + if name_lower == 'max_deeplab_s': + return MaXDeepLabS(name_lower, **kwargs) + elif name_lower == 'max_deeplab_l': + return MaXDeepLabL(name_lower, **kwargs) + elif name_lower == 'max_deeplab_s_backbone': + return MaXDeepLabSBackbone(name_lower, **kwargs) + elif name_lower == 'max_deeplab_l_backbone': + return MaXDeepLabLBackbone(name_lower, **kwargs) + elif name_lower == 'resnet50': + return ResNet50(name_lower, **kwargs) + elif name_lower == 'resnet50_beta': + return ResNet50Beta(name_lower, **kwargs) + elif name_lower == 'swidernet' or name_lower == 'wide_resnet41': + return SWideRNet(name_lower, **kwargs) + elif name_lower == 'axial_swidernet': + return AxialSWideRNet(name_lower, **kwargs) + elif name_lower == 'axial_resnet_s': + return AxialResNetS(name_lower, **kwargs) + elif name_lower == 'axial_resnet_l': + return AxialResNetL(name_lower, **kwargs) + elif name_lower == 'axial_deeplab_s': + return AxialDeepLabS(name_lower, **kwargs) + elif name_lower == 'axial_deeplab_l': + return AxialDeepLabL(name_lower, **kwargs) + else: + raise ValueError(name_lower + ' is not supported.') diff --git a/model/encoder/axial_resnet_instances_test.py b/model/encoder/axial_resnet_instances_test.py new file mode 100644 index 0000000000000000000000000000000000000000..0a13f4a8eb02873b4088990faba87160ac1ed2c0 --- /dev/null +++ b/model/encoder/axial_resnet_instances_test.py @@ -0,0 +1,234 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for axial_resnet_instances.""" + +import os + +from absl import flags +from absl.testing import parameterized +import numpy as np +import tensorflow as tf + +from deeplab2.model import test_utils +from deeplab2.model.encoder import axial_resnet_instances + +FLAGS = flags.FLAGS + + +class AxialResnetInstancesTest(tf.test.TestCase, parameterized.TestCase): + + # The parameter count does not include the classification head. 
+ @parameterized.parameters( + ('resnet50', 1, 23508032), + ('resnet50_beta', 1, 23631808), # 123776 more than resnet50 + ('max_deeplab_s_backbone', 1, 41343424), + ('max_deeplab_l_backbone', 1, 175115392), + ('axial_resnet_s', 1, 11466912), + ('axial_resnet_l', 1, 43714048), # 127872 fewer than axial_deeplab_l + ('axial_deeplab_s', 1, 11565856), + ('axial_deeplab_l', 1, 43841920), + ('swidernet', 1, 109014080), # SWideRNet-(1,1,1) without SE or SAC + ('swidernet', 3, 333245504), # Should be more than 3 x 109014080 + ('swidernet', 4.5, 487453760), # Rounded down to [13, 27, 13, 13] + ('axial_swidernet', 1, 136399392), + ('axial_swidernet', 3, 393935520), + ('axial_swidernet', 4.5, 570346912), + ) + def test_model_output_shape_and_num_params( + self, model_name, backbone_layer_multiplier, expected_num_params): + model = axial_resnet_instances.get_model( + model_name, + backbone_layer_multiplier=backbone_layer_multiplier, + bn_layer=tf.keras.layers.BatchNormalization, + conv_kernel_weight_decay=0.0001) + output = model(tf.keras.Input(shape=(224, 224, 3))) + if model_name in ('axial_resnet_s', 'axial_deeplab_s'): + self.assertListEqual(output['res5'].get_shape().as_list(), + [None, 14, 14, 1024]) + else: + self.assertListEqual(output['res5'].get_shape().as_list(), + [None, 14, 14, 2048]) + num_params = np.sum( + [np.prod(v.get_shape().as_list()) for v in model.trainable_weights]) + self.assertEqual(num_params, expected_num_params) + + def test_resnet50_variable_checkpoint_names(self): + model = axial_resnet_instances.get_model( + 'resnet50', + bn_layer=tf.keras.layers.BatchNormalization, + conv_kernel_weight_decay=0.0001) + model(tf.keras.Input(shape=(224, 224, 3))) + variable_names = [w.name for w in model.trainable_weights] + test_variable_name = 'resnet50/stage4/block6/conv3_bn/batch_norm/beta:0' + self.assertIn(test_variable_name, variable_names) + temp_dir = self.create_tempdir() + temp_path = os.path.join(temp_dir, 'ckpt') + checkpoint = tf.train.Checkpoint(encoder=model) + checkpoint.save(temp_path) + latest_checkpoint = tf.train.latest_checkpoint(temp_dir) + reader = tf.train.load_checkpoint(latest_checkpoint) + checkpoint_names = reader.get_variable_to_shape_map().keys() + test_checkpoint_name = 'encoder/_stage4/_block6/_conv3_bn/_batch_norm/gamma/.ATTRIBUTES/VARIABLE_VALUE' + self.assertIn(test_checkpoint_name, checkpoint_names) + + def test_max_deeplab_s_output_shape_and_num_params(self): + model = axial_resnet_instances.get_model( + 'max_deeplab_s', + bn_layer=tf.keras.layers.BatchNormalization, + conv_kernel_weight_decay=0.0001) + endpoints = model(tf.keras.Input(shape=(65, 65, 3))) + self.assertListEqual(endpoints['backbone_output'].get_shape().as_list(), + [None, 5, 5, 2048]) + self.assertListEqual( + endpoints['transformer_class_feature'].get_shape().as_list(), + [None, 128, 256]) + self.assertListEqual( + endpoints['transformer_mask_feature'].get_shape().as_list(), + [None, 128, 256]) + self.assertListEqual(endpoints['feature_panoptic'].get_shape().as_list(), + [None, 17, 17, 256]) + self.assertListEqual(endpoints['feature_semantic'].get_shape().as_list(), + [None, 5, 5, 2048]) + num_params = np.sum( + [np.prod(v.get_shape().as_list()) for v in model.trainable_weights]) + self.assertEqual(num_params, 61726624) + + def test_max_deeplab_l_output_shape_and_num_params(self): + model = axial_resnet_instances.get_model( + 'max_deeplab_l', + bn_layer=tf.keras.layers.BatchNormalization, + conv_kernel_weight_decay=0.0001) + endpoints = model(tf.keras.Input(shape=(65, 65, 3))) + 
self.assertListEqual(endpoints['backbone_output'].get_shape().as_list(),
+                         [None, 5, 5, 2048])
+    self.assertListEqual(
+        endpoints['transformer_class_feature'].get_shape().as_list(),
+        [None, 128, 512])
+    self.assertListEqual(
+        endpoints['transformer_mask_feature'].get_shape().as_list(),
+        [None, 128, 512])
+    self.assertListEqual(endpoints['feature_panoptic'].get_shape().as_list(),
+                         [None, 17, 17, 256])
+    self.assertListEqual(endpoints['feature_semantic'].get_shape().as_list(),
+                         [None, 17, 17, 256])
+    num_params = np.sum(
+        [np.prod(v.get_shape().as_list()) for v in model.trainable_weights])
+    self.assertEqual(num_params, 450523232)
+
+  def test_global_attention_absolute_positional_encoding_names(self):
+    model = axial_resnet_instances.get_model(
+        'max_deeplab_s_backbone',
+        block_group_config={'use_global_beyond_stride': 16,
+                            'positional_encoding_type': '1D',
+                            'axial_layer_config': {
+                                'use_query_rpe_similarity': False,
+                                'use_key_rpe_similarity': False,
+                                'retrieve_value_rpe': False}},
+        bn_layer=tf.keras.layers.BatchNormalization,
+        conv_kernel_weight_decay=0.0001)
+    model(tf.keras.Input(shape=(224, 224, 3)))
+    variable_names = [w.name for w in model.trainable_weights]
+    test_variable_name1 = 'max_deeplab_s_backbone/stage4/add_absolute_positional_encoding/height_axis_embeddings:0'
+    test_variable_name2 = 'max_deeplab_s_backbone/stage4/block2/attention/global/qkv_kernel:0'
+    self.assertIn(test_variable_name1, variable_names)
+    self.assertIn(test_variable_name2, variable_names)
+
+  @parameterized.product(
+      (dict(model_name='resnet50', backbone_layer_multiplier=1),
+       dict(model_name='resnet50_beta', backbone_layer_multiplier=1),
+       dict(model_name='wide_resnet41', backbone_layer_multiplier=1),
+       dict(model_name='swidernet', backbone_layer_multiplier=2)),
+      output_stride=[4, 8, 16, 32])
+  def test_model_atrous_consistency_with_output_stride_four(
+      self, model_name, backbone_layer_multiplier, output_stride):
+    tf.random.set_seed(0)
+
+    # Create the input.
+    pixel_inputs = test_utils.create_test_input(1, 225, 225, 3)
+
+    # Create the model.
+    model_1 = axial_resnet_instances.get_model(
+        model_name,
+        backbone_layer_multiplier=backbone_layer_multiplier,
+        bn_layer=tf.keras.layers.BatchNormalization,
+        conv_kernel_weight_decay=0.0001,
+        output_stride=4)
+
+    # Create the weights.
+    model_1(pixel_inputs, training=False)
+
+    # Set the batch norm gamma as non-zero so that the 3x3 convolution affects
+    # the output.
+    for weight in model_1.trainable_weights:
+      if '/gamma:0' in weight.name:
+        weight.assign(tf.ones_like(weight))
+
+    # Dense feature extraction followed by subsampling.
+    pixel_outputs = model_1(pixel_inputs, training=False)['res5']
+    downsampling_stride = output_stride // 4
+    expected = pixel_outputs[:, ::downsampling_stride, ::downsampling_stride, :]
+
+    # Feature extraction at the nominal network rate.
+    model_2 = axial_resnet_instances.get_model(
+        model_name,
+        backbone_layer_multiplier=backbone_layer_multiplier,
+        bn_layer=tf.keras.layers.BatchNormalization,
+        conv_kernel_weight_decay=0.0001,
+        output_stride=output_stride)
+    # Create the weights.
+    model_2(pixel_inputs, training=False)
+    # Make the two networks use the same weights.
+    model_2.set_weights(model_1.get_weights())
+    output = model_2(pixel_inputs, training=False)['res5']
+
+    # Normalize the outputs. Since we set batch_norm gamma to 1, the output
+    # activations can explode to a large standard deviation, which sometimes
+    # causes numerical errors beyond the tolerances.
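+    # (Dividing both tensors by the same scalar leaves their relative
+    # difference unchanged while keeping magnitudes near one, so the
+    # atol/rtol comparison below remains meaningful.)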
+ normalizing_factor = tf.math.reduce_std(expected) + # Compare normalized outputs. + self.assertAllClose(output / normalizing_factor, + expected / normalizing_factor, + atol=1e-4, rtol=1e-4) + + @parameterized.parameters( + ('resnet50',), + ('resnet50_beta',), + ('max_deeplab_s_backbone',), + ('max_deeplab_l_backbone',), + ('axial_resnet_s',), + ('axial_resnet_l',), + ('axial_deeplab_s',), + ('axial_deeplab_l',), + ('swidernet',), + ('axial_swidernet',), + ) + def test_model_export(self, model_name): + model = axial_resnet_instances.get_model( + model_name, + output_stride=16, + backbone_layer_multiplier=1.0, + bn_layer=tf.keras.layers.BatchNormalization, + conv_kernel_weight_decay=0.0001, + # Disable drop path as it is not compatible with model exporting. + block_group_config={'drop_path_keep_prob': 1.0}) + model(tf.keras.Input([257, 257, 3], batch_size=1), training=False) + export_dir = os.path.join( + FLAGS.test_tmpdir, 'test_model_export', model_name) + model.save(export_dir) + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/encoder/axial_resnet_test.py b/model/encoder/axial_resnet_test.py new file mode 100644 index 0000000000000000000000000000000000000000..c50b66261951164560725bd530288cededfdb8cd --- /dev/null +++ b/model/encoder/axial_resnet_test.py @@ -0,0 +1,46 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for axial_resnet.""" + +import numpy as np +import tensorflow as tf + +from deeplab2.model.encoder import axial_resnet + + +class AxialResNetTest(tf.test.TestCase): + + def test_axial_resnet_correct_output_shape(self): + model = axial_resnet.AxialResNet('max_deeplab_s') + endpoints = model(tf.zeros([2, 65, 65, 3]), training=False) + self.assertListEqual(endpoints['backbone_output'].get_shape().as_list(), + [2, 5, 5, 2048]) + self.assertListEqual( + endpoints['transformer_class_feature'].get_shape().as_list(), + [2, 128, 256]) + self.assertListEqual( + endpoints['transformer_mask_feature'].get_shape().as_list(), + [2, 128, 256]) + self.assertListEqual(endpoints['feature_panoptic'].get_shape().as_list(), + [2, 17, 17, 256]) + self.assertListEqual(endpoints['feature_semantic'].get_shape().as_list(), + [2, 5, 5, 2048]) + num_params = np.sum( + [np.prod(v.get_shape().as_list()) for v in model.trainable_weights]) + self.assertEqual(num_params, 61726624) + +if __name__ == '__main__': + tf.test.main() diff --git a/model/encoder/mobilenet.py b/model/encoder/mobilenet.py new file mode 100644 index 0000000000000000000000000000000000000000..5bb1a8d1a3a1ac0c4a59b53f3c663d62cc95a689 --- /dev/null +++ b/model/encoder/mobilenet.py @@ -0,0 +1,410 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""MobileNetV3 models for Deep Labeling. + +Reference: + Howard, A., Sandler, M., et al. Searching for mobilenetv3. In ICCV, 2019 +""" +from typing import Any, Callable, Mapping, Optional, Sequence + +import tensorflow as tf + +from deeplab2.model import utils +from deeplab2.model.layers import blocks +from deeplab2.model.layers import convolutions + +# The default input image channels. +_INPUT_CHANNELS = 3 + + +MNV3Small_BLOCK_SPECS = { + 'spec_name': 'MobileNetV3Small', + 'block_spec_schema': ['block_fn', 'kernel_size', 'strides', 'filters', + 'activation', 'se_ratio', 'expand_ratio', + 'is_endpoint'], + 'block_specs': [ + ('conv_bn', 3, 2, 16, + 'hard_swish', None, None, True), + ('inverted_bottleneck', 3, 2, 16, + 'relu', 0.25, 1, True), + ('inverted_bottleneck', 3, 2, 24, + 'relu', None, 72. / 16, False), + ('inverted_bottleneck', 3, 1, 24, + 'relu', None, 88. / 24, True), + ('inverted_bottleneck', 5, 2, 40, + 'hard_swish', 0.25, 4., False), + ('inverted_bottleneck', 5, 1, 40, + 'hard_swish', 0.25, 6., False), + ('inverted_bottleneck', 5, 1, 40, + 'hard_swish', 0.25, 6., False), + ('inverted_bottleneck', 5, 1, 48, + 'hard_swish', 0.25, 3., False), + ('inverted_bottleneck', 5, 1, 48, + 'hard_swish', 0.25, 3., True), + ('inverted_bottleneck', 5, 2, 96, + 'hard_swish', 0.25, 6., False), + ('inverted_bottleneck', 5, 1, 96, + 'hard_swish', 0.25, 6., False), + ('inverted_bottleneck', 5, 1, 96, + 'hard_swish', 0.25, 6., False), + ('conv_bn', 1, 1, 576, + 'hard_swish', None, None, True), + ] +} + + +MNV3Large_BLOCK_SPECS = { + 'spec_name': 'MobileNetV3Large', + 'block_spec_schema': ['block_fn', 'kernel_size', 'strides', 'filters', + 'activation', 'se_ratio', 'expand_ratio', + 'is_endpoint'], + 'block_specs': [ + ('conv_bn', 3, 2, 16, + 'hard_swish', None, None, False), + ('inverted_bottleneck', 3, 1, 16, + 'relu', None, 1., True), + ('inverted_bottleneck', 3, 2, 24, + 'relu', None, 4., False), + ('inverted_bottleneck', 3, 1, 24, + 'relu', None, 3., True), + ('inverted_bottleneck', 5, 2, 40, + 'relu', 0.25, 3., False), + ('inverted_bottleneck', 5, 1, 40, + 'relu', 0.25, 3., False), + ('inverted_bottleneck', 5, 1, 40, + 'relu', 0.25, 3., True), + ('inverted_bottleneck', 3, 2, 80, + 'hard_swish', None, 6., False), + ('inverted_bottleneck', 3, 1, 80, + 'hard_swish', None, 2.5, False), + ('inverted_bottleneck', 3, 1, 80, + 'hard_swish', None, 2.3, False), + ('inverted_bottleneck', 3, 1, 80, + 'hard_swish', None, 2.3, False), + ('inverted_bottleneck', 3, 1, 112, + 'hard_swish', 0.25, 6., False), + ('inverted_bottleneck', 3, 1, 112, + 'hard_swish', 0.25, 6., True), + ('inverted_bottleneck', 5, 2, 160, + 'hard_swish', 0.25, 6., False), + ('inverted_bottleneck', 5, 1, 160, + 'hard_swish', 0.25, 6., False), + ('inverted_bottleneck', 5, 1, 160, + 'hard_swish', 0.25, 6., False), + ('conv_bn', 1, 1, 960, + 'hard_swish', None, None, True), + ] +} + + +SUPPORTED_SPECS_MAP = { + 'MobileNetV3Large': MNV3Large_BLOCK_SPECS, + 'MobileNetV3Small': MNV3Small_BLOCK_SPECS, +} + + +# pylint: disable=invalid-name +def _block_spec_decoder(specs: Mapping[Any, Any], + width_multiplier: float, + divisible_by: 
int = 8) -> Sequence[Mapping[str, Any]]:
+  """Decodes specs for a block.
+
+  Args:
+    specs: A `dict` specification of block specs of a mobilenet version.
+    width_multiplier: A `float` multiplier for the filter size for all
+      convolution ops. The value must be greater than zero. Typical usage will
+      be to set this value in (0, 1) to reduce the number of parameters or
+      computation cost of the model.
+    divisible_by: An `int` that ensures all inner dimensions are divisible by
+      this number.
+
+  Returns:
+    A list of block specs, each a dictionary that defines the structure of a
+    layer.
+  """
+
+  spec_name = specs['spec_name']
+  block_spec_schema = specs['block_spec_schema']
+  block_specs = specs['block_specs']
+
+  if not block_specs:
+    raise ValueError(
+        'The block spec cannot be empty for {}!'.format(spec_name))
+
+  if len(block_specs[0]) != len(block_spec_schema):
+    raise ValueError('The block spec values {} do not match with '
+                     'the schema {}'.format(block_specs[0], block_spec_schema))
+
+  decoded_specs = []
+
+  for spec in block_specs:
+    spec_dict = dict(zip(block_spec_schema, spec))
+    decoded_specs.append(spec_dict)
+
+  for ds in decoded_specs:
+    ds['filters'] = utils.make_divisible(
+        value=ds['filters'] * width_multiplier,
+        divisor=divisible_by,
+        min_value=8)
+
+  return decoded_specs
+# pylint: enable=invalid-name
+
+
+class MobileNet(tf.keras.Model):
+  """Creates a MobileNetV3 family model."""
+
+  def __init__(
+      self,
+      model_id: str = 'MobileNetV3Small',
+      width_multiplier: float = 1.0,
+      output_stride: Optional[int] = None,
+      min_width: int = 8,
+      divisible_by: int = 8,
+      regularize_depthwise: bool = False,
+      bn_layer: Callable[..., Any] = tf.keras.layers.BatchNormalization,
+      conv_kernel_weight_decay: float = 0.0,
+      name: str = 'MobileNetV3'):
+    """Initializes a MobileNet V3 model.
+
+    Args:
+      model_id: A `str` of MobileNet version. The supported values are
+        `MobileNetV3Large`, `MobileNetV3Small`.
+      width_multiplier: A `float` of multiplier for the filters (number of
+        channels) for all convolution ops. The value must be greater than zero.
+        Typical usage will be to set this value in (0, 1) to reduce the number
+        of parameters or computation cost of the model.
+      output_stride: An `int` that specifies the requested ratio of input to
+        output spatial resolution. If not None, then we invoke atrous
+        convolution if necessary to prevent the network from reducing the
+        spatial resolution of activation maps. The output_stride should be
+        divisible by 4.
+      min_width: An `int` of minimum width (number of channels) for all
+        convolution ops. Enforced when width_multiplier < 1, and not an active
+        constraint when width_multiplier >= 1.
+      divisible_by: An `int` that ensures all intermediate feature dimensions
+        are divisible by this number.
+      regularize_depthwise: If True, apply regularization on depthwise conv.
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+      conv_kernel_weight_decay: A float, the weight decay for convolution
+        kernels.
+      name: Model name.
+
+    Raises:
+      ValueError: The MobileNet version is not supported.
+      ValueError: width_multiplier is not greater than zero.
+      ValueError: Output stride must be None or a multiple of 4.
+      ValueError: Unknown block type i for layer j.
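+
+    Example (an illustrative sketch, not an excerpt from the test suite):
+
+      backbone = MobileNet(model_id='MobileNetV3Small', output_stride=16)
+      endpoints = backbone(tf.ones([1, 224, 224, 3]), training=False)
+      # endpoints maps 'res1'...'res5' to intermediate feature tensors.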
+ """ + if model_id not in SUPPORTED_SPECS_MAP: + raise ValueError('The MobileNet version {} ' + 'is not supported'.format(model_id)) + + if width_multiplier <= 0: + raise ValueError('width_multiplier is not greater than zero.') + + if (output_stride is not None and + (output_stride <= 1 or (output_stride > 1 and output_stride % 4))): + raise ValueError('Output stride must be None or a multiple of 4.') + + super().__init__(name=name) + + self._model_id = model_id + self._width_multiplier = width_multiplier + self._min_width = min_width + self._output_stride = output_stride + self._divisible_by = divisible_by + self._regularize_depthwise = regularize_depthwise + self._bn_layer = bn_layer + self._conv_kernel_weight_decay = conv_kernel_weight_decay + self._blocks = [] + self._endpoint_names = [] + + block_specs = SUPPORTED_SPECS_MAP.get(model_id) + self._decoded_specs = _block_spec_decoder( + specs=block_specs, + width_multiplier=self._width_multiplier, + divisible_by=self._divisible_by) + + self._mobilenet_base() + + def _mobilenet_base(self): + """Builds the base MobileNet architecture.""" + + # The current_stride variable keeps track of the output stride of the + # activations, i.e., the running product of convolution strides up to the + # current network layer. This allows us to invoke atrous convolution + # whenever applying the next convolution would result in the activations + # having output stride larger than the target output_stride. + current_stride = 1 + + # The atrous convolution rate parameter. + rate = 1 + + endpoint_level = 1 + in_filters = _INPUT_CHANNELS + for i, block_def in enumerate(self._decoded_specs): + # We only need to build up to 'res5' endpoint for segmentation task. + if endpoint_level > 5 and not self._classification_mode: + break + + block_name = '{}_{}'.format(block_def['block_fn'], i + 1) + + if (self._output_stride is not None and + current_stride == self._output_stride): + # If we have reached the target output_stride, then we need to employ + # atrous convolution with stride=1 and multiply the atrous rate by the + # current unit's stride for use in subsequent layers. + layer_stride = 1 + layer_rate = rate + rate = ( + rate * block_def['strides'] + if block_def['strides'] is not None else rate) + else: + layer_stride = block_def['strides'] + layer_rate = 1 + current_stride = ( + current_stride * block_def['strides'] + if block_def['strides'] is not None else current_stride) + + if block_def['block_fn'] == 'conv_bn': + + self._blocks.append( + convolutions.Conv2DSame( + output_channels=block_def['filters'], + kernel_size=block_def['kernel_size'], + strides=layer_stride, + atrous_rate=layer_rate, + activation=block_def['activation'], + use_bias=False, + bn_layer=self._bn_layer, + use_bn=True, + conv_kernel_weight_decay=self._conv_kernel_weight_decay, + name=block_name, + )) + + elif block_def['block_fn'] == 'inverted_bottleneck': + atrous_rate = 1 + # There is no need to apply atrous convolution to any 1x1 convolution. 
+        if layer_rate > 1 and block_def['kernel_size'] != 1:
+          atrous_rate = layer_rate
+        self._blocks.append(
+            blocks.InvertedBottleneckBlock(
+                in_filters=in_filters,
+                out_filters=block_def['filters'],
+                expand_ratio=block_def['expand_ratio'],
+                strides=layer_stride,
+                kernel_size=block_def['kernel_size'],
+                se_ratio=block_def['se_ratio'],
+                activation=block_def['activation'],
+                expand_se_in_filters=True,
+                depthwise_activation=None,
+                atrous_rate=atrous_rate,
+                divisible_by=self._divisible_by,
+                regularize_depthwise=self._regularize_depthwise,
+                use_depthwise=True,
+                # Note that whether the residual connection is actually used
+                # is also conditional on the in_filters and out_filters sizes,
+                # even if use_residual=True: e.g., when in_filters !=
+                # out_filters, no residual connection will be created.
+                use_residual=(block_def['strides'] == 1),
+                bn_layer=self._bn_layer,
+                conv_kernel_weight_decay=self._conv_kernel_weight_decay,
+                name=block_name,
+            ))
+
+      else:
+        raise ValueError('Unknown block type {} for layer {}'.format(
+            block_def['block_fn'], i))
+
+      # Register in_filters for the next block.
+      in_filters = block_def['filters']
+
+      if block_def['is_endpoint']:
+        # Name the endpoint to be 'res{1...5}' to align with ResNet. This
+        # simplifies segmentation head implementation.
+        self._endpoint_names.append('res' + str(endpoint_level))
+        endpoint_level += 1
+      else:
+        self._endpoint_names.append(None)
+
+  def call(self, input_tensor: tf.Tensor, training: bool = False):
+    """Performs a forward pass through MobileNet."""
+    net = input_tensor
+    endpoints = {}
+    for block, endpoint_name in zip(self._blocks, self._endpoint_names):
+      net = block(net, training=training)
+      if endpoint_name is not None:
+        endpoints[endpoint_name] = net
+    return endpoints
+
+
+def MobileNetV3Small(
+    width_multiplier: float = 1.0,
+    output_stride: int = 32,
+    bn_layer: Callable[..., Any] = tf.keras.layers.BatchNormalization,
+    conv_kernel_weight_decay: float = 0.0,
+    name: str = 'MobileNetV3Small') -> tf.keras.Model:
+  """Creates a MobileNetV3Small model.
+
+  Args:
+    width_multiplier: A float, the channel (width) multiplier for the whole
+      model.
+    output_stride: An optional integer specifying the output stride of the
+      network.
+    bn_layer: An optional tf.keras.layers.Layer that computes the
+      normalization (default: tf.keras.layers.BatchNormalization).
+    conv_kernel_weight_decay: A float, the weight decay for convolution
+      kernels.
+    name: Model name.
+
+  Returns:
+    The MobileNetV3Small model as an instance of tf.keras.Model.
+  """
+  model = MobileNet(model_id='MobileNetV3Small',
+                    width_multiplier=width_multiplier,
+                    output_stride=output_stride,
+                    bn_layer=bn_layer,
+                    conv_kernel_weight_decay=conv_kernel_weight_decay,
+                    name=name)
+  return model
+
+
+def MobileNetV3Large(
+    width_multiplier: float = 1.0,
+    output_stride: int = 32,
+    bn_layer: Callable[..., Any] = tf.keras.layers.BatchNormalization,
+    conv_kernel_weight_decay: float = 0.0,
+    name: str = 'MobileNetV3Large') -> tf.keras.Model:
+  """Creates a MobileNetV3Large model.
+
+  Args:
+    width_multiplier: A float, the channel (width) multiplier for the whole
+      model.
+    output_stride: An optional integer specifying the output stride of the
+      network.
+    bn_layer: An optional tf.keras.layers.Layer that computes the
+      normalization (default: tf.keras.layers.BatchNormalization).
+    conv_kernel_weight_decay: A float, the weight decay for convolution
+      kernels.
+    name: Model name.
+
+  Returns:
+    The MobileNetV3Large model as an instance of tf.keras.Model.
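+
+  Example (illustrative; the shape follows from a 224x224 input at the
+  default output_stride of 32):
+
+    model = MobileNetV3Large(width_multiplier=1.0)
+    endpoints = model(tf.ones([1, 224, 224, 3]), training=False)
+    # endpoints['res5'] has shape [1, 7, 7, 960].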
+ """ + model = MobileNet(model_id='MobileNetV3Large', + width_multiplier=width_multiplier, + output_stride=output_stride, + bn_layer=bn_layer, + conv_kernel_weight_decay=conv_kernel_weight_decay, + name=name) + return model diff --git a/model/encoder/mobilenet_test.py b/model/encoder/mobilenet_test.py new file mode 100644 index 0000000000000000000000000000000000000000..fced57f8d09f808a4fb2bc16e9c56e7ceade1846 --- /dev/null +++ b/model/encoder/mobilenet_test.py @@ -0,0 +1,129 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for mobilenet.""" + +from absl.testing import parameterized + +import tensorflow as tf + +from deeplab2.model import test_utils +from deeplab2.model.encoder import mobilenet + + +class MobilenetTest(tf.test.TestCase, parameterized.TestCase): + + @parameterized.parameters('MobileNetV3Small', 'MobileNetV3Large') + def test_mobilenetv3_construct_graph(self, model_name): + tf.keras.backend.set_image_data_format('channels_last') + input_size = 128 + + mobilenet_models = { + 'MobileNetV3Small': mobilenet.MobileNetV3Small, + 'MobileNetV3Large': mobilenet.MobileNetV3Large, + } + mobilenet_channels = { + # The number of filters of layers having outputs been collected + # for filter_size_scale = 1.0 + 'MobileNetV3Small': [16, 24, 48, 576], + 'MobileNetV3Large': [24, 40, 112, 960], + } + network = mobilenet_models[str(model_name)](width_multiplier=1.0) + + inputs = tf.ones([1, input_size, input_size, 3]) + endpoints = network(inputs) + + for idx, num_filter in enumerate(mobilenet_channels[model_name]): + self.assertAllEqual( + [1, input_size / 2 ** (idx+2), input_size / 2 ** (idx+2), num_filter], + endpoints['res'+str(idx+2)].shape.as_list()) + + @parameterized.product( + model_name=['MobileNetV3Small', 'MobileNetV3Large'], + output_stride=[4, 8, 16, 32]) + def test_mobilenetv3_atrous_endpoint_shape(self, model_name, output_stride): + tf.keras.backend.set_image_data_format('channels_last') + input_size = 321 + batch_size = 2 + + mobilenet_models = { + 'MobileNetV3Small': mobilenet.MobileNetV3Small, + 'MobileNetV3Large': mobilenet.MobileNetV3Large, + } + stride_spatial_shapes_map = { + 4: [81, 81, 81, 81], + 8: [81, 41, 41, 41], + 16: [81, 41, 21, 21], + 32: [81, 41, 21, 11], + } + mobilenet_channels = { + # The number of filters of layers having outputs been collected + # for filter_size_scale = 1.0 + 'MobileNetV3Small': [16, 24, 48, 576], + 'MobileNetV3Large': [24, 40, 112, 960], + } + network = mobilenet_models[str(model_name)]( + width_multiplier=1.0, + output_stride=output_stride) + spatial_shapes = stride_spatial_shapes_map[output_stride] + + inputs = tf.ones([batch_size, input_size, input_size, 3]) + endpoints = network(inputs) + + for idx, num_filters in enumerate(mobilenet_channels[model_name]): + expected_shape = [ + batch_size, spatial_shapes[idx], spatial_shapes[idx], num_filters + ] + self.assertAllEqual( + expected_shape, + endpoints['res'+str(idx+2)].shape.as_list()) + + 
+  @parameterized.parameters('MobileNetV3Small', 'MobileNetV3Large')
+  def test_mobilenet_reload_weights(self, model_name):
+    tf.keras.backend.set_image_data_format('channels_last')
+    mobilenet_models = {
+        'MobileNetV3Small': mobilenet.MobileNetV3Small,
+        'MobileNetV3Large': mobilenet.MobileNetV3Large,
+    }
+
+    tf.random.set_seed(0)
+    pixel_inputs = test_utils.create_test_input(1, 320, 320, 3)
+
+    network1 = mobilenet_models[model_name](
+        width_multiplier=1.0,
+        output_stride=32,
+        name='m1')
+    network1(pixel_inputs, False)
+    outputs1 = network1(pixel_inputs, False)
+    pixel_outputs = outputs1['res5']
+
+    # A second, identically configured network.
+    network2 = mobilenet_models[model_name](
+        width_multiplier=1.0,
+        output_stride=32,
+        name='m2')
+    network2(pixel_inputs, False)
+    # Make the two networks use the same weights.
+    network2.set_weights(network1.get_weights())
+    outputs2 = network2(pixel_inputs, False)
+    expected = outputs2['res5']
+
+    self.assertAllClose(network1.get_weights(), network2.get_weights(),
+                        atol=1e-4, rtol=1e-4)
+    self.assertAllClose(pixel_outputs, expected, atol=1e-4, rtol=1e-4)
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/model/layers/__init__.py b/model/layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..35e4ce02ff422f3aa84ab644b88d65b13e0cbc03
--- /dev/null
+++ b/model/layers/__init__.py
@@ -0,0 +1,15 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/model/layers/activations.py b/model/layers/activations.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b47a4378440dee008f5f176856906b0f6716046
--- /dev/null
+++ b/model/layers/activations.py
@@ -0,0 +1,132 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Defines a set of useful activation functions."""
+import functools
+import tensorflow as tf
+
+
+def gelu(input_tensor, approximate=False):
+  """Gaussian Error Linear Unit.
+
+  Reference:
+    Gaussian Error Linear Units (GELUs), Dan Hendrycks, Kevin Gimpel,
+    arXiv 2016.
+
+  Args:
+    input_tensor: A tensor with an arbitrary shape.
+    approximate: A boolean, whether to enable approximation.
+
+  Returns:
+    The activated input tensor.
+  """
+  return tf.keras.activations.gelu(input_tensor, approximate=approximate)
+
+
+def hard_sigmoid(input_tensor):
+  """Hard sigmoid activation function.
+
+  Args:
+    input_tensor: A tensor with an arbitrary shape.
+ + Returns: + The activated input tensor. + """ + input_tensor = tf.convert_to_tensor(input_tensor) + return tf.nn.relu6(input_tensor + tf.constant(3.)) * 0.16667 + + +def relu6(input_tensor): + """Relu6 activation function. + + Args: + input_tensor: A tensor with an arbitrary shape. + + Returns: + The activated input tensor. + """ + input_tensor = tf.convert_to_tensor(input_tensor) + return tf.nn.relu6(input_tensor) + + +def swish(input_tensor): + """Swish or SiLU activation function. + + Args: + input_tensor: A tensor with an arbitrary shape. + + Returns: + The activated input tensor. + """ + input_tensor = tf.convert_to_tensor(input_tensor) + return tf.nn.silu(input_tensor) + + +def hard_swish(input_tensor): + """Hard Swish function. + + Args: + input_tensor: A tensor with an arbitrary shape. + + Returns: + The activated input tensor. + """ + input_tensor = tf.convert_to_tensor(input_tensor) + return input_tensor * tf.nn.relu6( + input_tensor + tf.constant(3.)) * (1. / 6.) + + +def identity(input_tensor): + """Identity function. + + Useful for helping in quantization. + + Args: + input_tensor: A tensor with an arbitrary shape. + + Returns: + The activated input tensor. + """ + input_tensor = tf.convert_to_tensor(input_tensor) + return tf.identity(input_tensor) + + +def get_activation(identifier): + """Gets activation function via input identifier. + + This function returns the specified customized activation function, if there + is any. Otherwise, tf.keras.activations.get is called. + + Args: + identifier: A string, name of the activation function. + + Returns: + The specified activation function. + """ + if isinstance(identifier, str): + name_to_fn = { + 'gelu': functools.partial(gelu, approximate=False), + 'approximated_gelu': functools.partial(gelu, approximate=True), + 'silu': swish, + 'swish': swish, + 'hard_swish': hard_swish, + 'relu6': relu6, + 'hard_sigmoid': hard_sigmoid, + 'identity': identity, + 'none': identity, + } + identifier = str(identifier).lower() + if identifier in name_to_fn: + return name_to_fn[identifier] + return tf.keras.activations.get(identifier) diff --git a/model/layers/activations_test.py b/model/layers/activations_test.py new file mode 100644 index 0000000000000000000000000000000000000000..d0c867c9fcad5218e28f3fb4f082274d1c48c173 --- /dev/null +++ b/model/layers/activations_test.py @@ -0,0 +1,36 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for activations.py.""" +import tensorflow as tf + +from deeplab2.model.layers import activations + + +class ActivationsTest(tf.test.TestCase): + + def test_gelu(self): + expected_data = [[0.14967535, 0., -0.10032465], + [-0.15880796, -0.04540223, 2.9963627]] + gelu_data = activations.gelu([[.25, 0, -.25], [-1, -2, 3]], + approximate=True) + self.assertAllClose(expected_data, gelu_data) + gelu_data_via_get_activation = activations.get_activation( + 'approximated_gelu')([[.25, 0, -.25], [-1, -2, 3]]) + self.assertAllClose(expected_data, gelu_data_via_get_activation) + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/layers/axial_block_groups.py b/model/layers/axial_block_groups.py new file mode 100644 index 0000000000000000000000000000000000000000..594b26381fc99960f6dd5c656b0b63a71a4be6bb --- /dev/null +++ b/model/layers/axial_block_groups.py @@ -0,0 +1,443 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Implements convolutional and attentional residual block groups.""" + +import math +import tensorflow as tf + +from deeplab2.model import utils +from deeplab2.model.layers import activations +from deeplab2.model.layers import axial_blocks +from deeplab2.model.layers import drop_path +from deeplab2.model.layers import dual_path_transformer +from deeplab2.model.layers import positional_encodings +from deeplab2.model.layers import recompute_grad as recompute_grad_lib + +# We will apply 10x larger learning rates on transformer layers. This global +# variable name will be accessed when we build the optimizers. This keyword is +# reserved and should not be a part of the variable names in a classification +# pretrained backbone. +TRANSFORMER = 'transformer' + + +def _get_current_names(index): + current_name = '_block{}'.format(index + 1) + transformer_current_name = '_block{}_{}'.format(index + 1, TRANSFORMER) + return current_name, transformer_current_name + + +class BlockGroup(tf.keras.layers.Layer): + """Applies a group of residual blocks with dual path transformer layers [1]. + + An optional dual-path transformer layer is inserted after each residual block. + The transformer layer performs memory2pixel attention, pixel2memory attention, + and memory2memory self-attention, while the standard residual block applies + the pixel2pixel axial-attention, global-attention, or spatial convolution. + + Reference: + [1] MaX-DeepLab: End-to-End Panoptic Segmentation with Mask Transformers, + CVPR 2021. https://arxiv.org/abs/2012.00759 + Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen. 
+ """ + + def __init__(self, + filters, + num_blocks, + name, + original_resnet_stride, + original_resnet_input_stride, + output_stride=16, + backbone_type='resnet_beta', + positional_encoding_type=None, + use_global_beyond_stride=0, + use_axial_beyond_stride=16, + use_transformer_beyond_stride=32, + use_sac_beyond_stride=0, + use_squeeze_and_excite=False, + conv_use_recompute_grad=False, + axial_use_recompute_grad=True, + recompute_within_stride=0, + transformer_use_recompute_grad=False, + transformer_expansion=1, + drop_path_keep_prob=0.8, + drop_path_beyond_stride=16, + drop_path_schedule='constant', + activation='relu', + attention_bottleneck_expansion=2, + axial_layer_config=None, + dual_path_transformer_layer_config=None, + bn_layer=tf.keras.layers.BatchNormalization, + conv_kernel_weight_decay=0.0): + """Initializes a BlockGroup layer. + + Args: + filters: An integer, the base number of channels for this block group. + num_blocks: An integer, the number of blocks for this block group. + name: A string, the name of the block group. + original_resnet_stride: An integer, the original resnet stride for this + block, usually 1 or 2. The stride will be applied if + original_resnet_input_stride is smaller than the desired output_stride. + Otherwise, the stride will not be applied, and atrous convolution will + be used after the first block. + original_resnet_input_stride: An integer, the total input stride in the + original resnet. For example, the total input stride for the last stage + of the original resnet is 16, and the total output stride is 32. This + stride differs from the true stride of the feature in that we might use + atrous convolution to change both the input and output stride to, e.g. + 8, but its original resnet input stride remains the same. In this case, + we also use the original resnet input stride to compute the atrous rate. + output_stride: An integer, the desired output_stride for the ResNet. + backbone_type: A string, the type of the backbone. Supports 'resnet', + 'resnet_beta', and 'wider_resnet'. The 'resnet' refers to the original + resnet with a 7x7 convolutional stem. The 'resnet_beta' means a resnet + but with an inception stem. The 'wider_resnet' is a wider variant of + resnet with extensively used 3x3 convolutions. + positional_encoding_type: A string, type of the positional encoding. + Support '2D', '1D', and None. + use_global_beyond_stride: An integer, the stride beyond which we use + global attention. Set to 0 if no global attention is desired. Defaults + to 0, i.e. we do not use global attention. + use_axial_beyond_stride: An integer, the stride beyond which we use axial + attention. Note that use_global_beyond_stride has a higher priority, + i.e. we use global attention if the stride is also beyond + use_global_beyond_stride. Set to 0 if no axial attention is desired. + Defaults to 16 as in MaX-DeepLab. + use_transformer_beyond_stride: An integer, the stride beyond which we use + a transformer layer. Set to 0 if no transformer is desired. Defaults to + 32 as in MaX-DeepLab-S. + use_sac_beyond_stride: An integer. Use the Switchable Atrous Convolution + (SAC) beyond the specified stride. For example, if + `use_sac_beyond_stride` = 16, SAC will be applied to the network stage + whose output stride >= 16 (i.e., 16 and 32). Set to 0 or -1 to disable + it. Defaults to 0 as SAC is not used in MaX-DeepLab. + use_squeeze_and_excite: A boolean, whether squeeze-and-excite (SE) is + used. Defaults to False as SE is not used in MaX-DeepLab. 
+ conv_use_recompute_grad: A boolean, whether to use the gradient + checkpointing trick for convolutional blocks. This trick reduces + accelerator memory usage, but takes longer to compute gradients. + Defaults to False since convolutional layers are memory efficient. + axial_use_recompute_grad: A boolean, whether to use the gradient + checkpointing trick for axial blocks. This trick reduces accelerator + memory usage, but takes longer to compute gradients. Defaults to True + since it saves memory for axial blocks. + recompute_within_stride: An integer, the stride within which we use the + gradient checkpointing trick. This trick reduces accelerator memory + usage, but takes longer to compute gradients. Defaults to 0 (do not + recompute any layer). + transformer_use_recompute_grad: A boolean, whether to use the gradient + checkpointing trick for dual-path transformer blocks. This trick reduces + accelerator memory usage, but takes longer to compute gradients. + Defaults to False. + transformer_expansion: An integer, the expansion ratio for the transformer + bottleneck. + drop_path_keep_prob: A float, the keep probability for dropping path. + Defaults to 0.8 as in MaX-DeepLab-S. + drop_path_beyond_stride: An integer, the stride beyond which we apply drop + path augmentation. Defaults to 16 as in MaX-DeepLab-S. + drop_path_schedule: A string, the drop path schedule. Currently, we + support 'constant': use the same drop path keep probability for all + stages, and 'linear': linearly decrease the drop path keep probability + from 1.0 at 0-th stage (or STEM) to `drop_path_keep_prob` at last stage. + activation: A string, type of activation function to apply. Support + 'relu', 'swish' (or 'silu'), 'gelu', 'approximated_gelu', and 'elu'. + attention_bottleneck_expansion: An integer, the expansion ratio for + axial attention blocks. + axial_layer_config: A dict, an argument dictionary for the axial layer. + dual_path_transformer_layer_config: A dict, an argument dictionary for the + transformer. + bn_layer: An optional tf.keras.layers.Layer that computes the + normalization (default: tf.keras.layers.BatchNormalization). + conv_kernel_weight_decay: A float, the weight decay for convolution + kernels. + + Raises: + ValueError: If backbone_type is not one of 'resnet', 'resnet_beta', or + 'wider_resnet'. + ValueError: original_resnet_input_stride is not power of 2. + ValueError: output_stride is not power of 2. + """ + if original_resnet_input_stride & (original_resnet_input_stride - 1): + raise ValueError('original_resnet_input_stride is not power of 2.') + if output_stride & (output_stride - 1): + raise ValueError('output_stride is not power of 2.') + + super(BlockGroup, self).__init__(name=name) + self._add_absolute_positional_encoding = None + self._activation_fn = activations.get_activation(activation) + self._num_blocks = num_blocks + self._drop_path_keep_prob = [] + self._recompute_grad = [] + self._transformer_use_recompute_grad = transformer_use_recompute_grad + if dual_path_transformer_layer_config is None: + dual_path_transformer_layer_config = {} + original_resnet_current_stride = original_resnet_input_stride + + use_sac = (original_resnet_input_stride * original_resnet_stride >= + use_sac_beyond_stride > 0) + + recompute_grad = (original_resnet_input_stride * original_resnet_stride <= + recompute_within_stride) + + for index in range(num_blocks): + current_name, transformer_current_name = _get_current_names(index) + + # Compute the current strides. 
If there is a stride for this block group, + # we do it in the first residual block. + if index == 0 and original_resnet_input_stride < output_stride: + current_strides = original_resnet_stride + else: + current_strides = 1 + + # Compute the current atrous rate. + if original_resnet_current_stride > output_stride: + atrous_rate = original_resnet_current_stride // output_stride + else: + atrous_rate = 1 + + # Compute the atrous rate for the second conv in the first basic block. + if (index == 0 and original_resnet_input_stride * original_resnet_stride > + output_stride): + basic_block_second_conv_atrous_rate = ( + original_resnet_input_stride * original_resnet_stride // + output_stride) + else: + basic_block_second_conv_atrous_rate = atrous_rate + + # Compute the current drop_path_keep_prob. + current_stage = math.log2(original_resnet_current_stride) - 1 + if original_resnet_current_stride >= drop_path_beyond_stride: + current_drop_path_keep_prob = drop_path.get_drop_path_keep_prob( + drop_path_keep_prob, drop_path_schedule, + current_stage=int(round(current_stage)), + num_stages=4) + else: + current_drop_path_keep_prob = 1.0 + + # Compute which block_fn to use for this residual block. + if original_resnet_current_stride >= use_global_beyond_stride > 0: + attention_type = 'global' + recompute_grad = axial_use_recompute_grad or recompute_grad + filters_list = [filters * attention_bottleneck_expansion, + filters, + filters * 4] + elif original_resnet_current_stride >= use_axial_beyond_stride > 0: + attention_type = 'axial' + recompute_grad = axial_use_recompute_grad or recompute_grad + filters_list = [filters * attention_bottleneck_expansion, + filters, + filters * 4] + elif backbone_type == 'resnet' or backbone_type == 'resnet_beta': + attention_type = None + recompute_grad = conv_use_recompute_grad or recompute_grad + filters_list = [filters, + filters, + filters * 4] + elif backbone_type == 'wider_resnet': + if original_resnet_input_stride * original_resnet_stride < 32: + # Wider-ResNet uses conv basic blocks except the last stage. + attention_type = None + recompute_grad = conv_use_recompute_grad or recompute_grad + filters_list = [filters * 4, + filters * 4] + else: + # Wider-ResNet uses an expanded bottleneck block in the last stage. + attention_type = None + recompute_grad = conv_use_recompute_grad or recompute_grad + filters_list = [filters, + filters * 2, + filters * 4] + else: + raise ValueError(backbone_type + ' is not supported.') + + self._drop_path_keep_prob.append(current_drop_path_keep_prob) + # Apply the residual block. + # The inputs to block_fn should be activated features. + block_fn = axial_blocks.AxialBlock( + filters_list, + kernel_size=3, + strides=current_strides, + atrous_rate=atrous_rate, + use_squeeze_and_excite=use_squeeze_and_excite, + use_sac=use_sac, + bn_layer=bn_layer, + activation=activation, + name=current_name[1:], + conv_kernel_weight_decay=conv_kernel_weight_decay, + basic_block_second_conv_atrous_rate=( + basic_block_second_conv_atrous_rate), + attention_type=attention_type, + axial_layer_config=axial_layer_config) + self._recompute_grad.append(recompute_grad) + utils.safe_setattr(self, current_name, block_fn) + + # Modify the original_resnet_stride according to the strides. + if index == 0 and original_resnet_stride > 1: + original_resnet_current_stride *= original_resnet_stride + # Add absolute positional encoding if we will apply global attention + # beyond this stride. 
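+      # (Global attention, unlike the axial-attention blocks, which carry
+      # relative positional encodings internally, has no notion of position
+      # on its own, so an absolute positional encoding is added first.)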
+ if original_resnet_current_stride == use_global_beyond_stride > 0: + self._add_absolute_positional_encoding = ( + positional_encodings.AddAbsolutePositionalEncoding( + 'add_absolute_positional_encoding', + positional_encoding_type, bn_layer, conv_kernel_weight_decay)) + if original_resnet_current_stride >= use_transformer_beyond_stride > 0: + # Apply a dual-path transformer. + transformer_block_fn = dual_path_transformer.DualPathTransformerLayer( + name=transformer_current_name[1:], + filters=int(128 * transformer_expansion), + activation=activation, + bn_layer=bn_layer, + conv_kernel_weight_decay=conv_kernel_weight_decay, + **dual_path_transformer_layer_config) + utils.safe_setattr(self, transformer_current_name, transformer_block_fn) + else: + utils.safe_setattr(self, transformer_current_name, None) + # Avoid using recompute_grad for the first call that builds the sub-layers. + # Otherwise, recompute_grad will not track newly built model parameters. + self._first_building_call = True + + def call(self, inputs, training=False): + """Performs a forward pass. + + Args: + inputs: two tensors. The first tensor is a pixel_space_input with shape + [batch, height, width, pixel_channels]. The second tensor is + memory_space_input with shape [batch, length, memory_channels]. This + input will be used only if a transformer is used. Otherwise, the input + is returned unmodified. + training: A boolean flag indicating whether training behavior should be + used (default: False). + + Returns: + output: An output [batch, height, width, filters * 4] tensor. + activated_output: An activated output [batch, height, width, filters * 4] + tensor. + memory_space_output: A memory space output [batch, length, + memory_channels] tensor. + """ + # The pixel space inputs are activated features. + activated_features, memory_space_output = inputs + + # Recompute_grad takes only float tensors as inputs. It does not allow + # bools or boolean tensors. For this reason, we cast training to a float + # tensor and cast it back after we go through the recompute_grad wrap. + float_tensor_training = tf.cast(training, tf.float32) + + for index in range(self._num_blocks): + current_name, transformer_current_name = _get_current_names(index) + block_fn_no_recompute = getattr( + self, current_name) + transformer_block_fn_no_recompute = getattr( + self, transformer_current_name) + current_drop_path_keep_prob = self._drop_path_keep_prob[index] + + # Wrap the layer if we want to recompute it in the backward pass. + if (self._recompute_grad[index] and training): + # The seed is not actually used since we do not have any random + # operation in the recomputed function. The purpose of the provided seed + # is to prevent recompute_grad from generating a new seed variable which + # is not compatible with model exporting. + block_fn = recompute_grad_lib.recompute_grad( + block_fn_no_recompute, seed=tf.constant(0, tf.int32)) + else: + block_fn = block_fn_no_recompute + + # The inputs to block_fn should be activated features. + block_fn_inputs = [activated_features, float_tensor_training] + # We have to define drop_path_masks outside the layer call and pass it + # into the layer, because tf.recompute_grad (gradient checkpointing) does + # not allow any randomness within the function call. In addition, + # recompute_grad functions can only take Tensors as inputs, so we do not + # pass the drop_path_random_mask (when it is None) into block_fn. 
+ if current_drop_path_keep_prob < 1.0 and training: + drop_path_random_mask = drop_path.generate_drop_path_random_mask( + activated_features, current_drop_path_keep_prob) + + block_fn_inputs.append(drop_path_random_mask) + + # Build the sub-layers when the block_fn is called for the first time. + # Otherwise, recompute_grad will not track newly built model parameters. + if self._first_building_call: + _ = block_fn_no_recompute(tuple(block_fn_inputs)) + # Apply the residual block. + features, activated_features = block_fn(tuple(block_fn_inputs)) + + if index == 0 and self._add_absolute_positional_encoding is not None: + features = self._add_absolute_positional_encoding(features, + training=training) + activated_features = self._activation_fn(features) + + if transformer_block_fn_no_recompute is not None: + # Reshape pixel space features from 4D to 3D. + _, height, width, channels = features.get_shape().as_list() + features = tf.reshape( + features, [-1, height * width, channels]) + + # Wrap the layer if we want to recompute it in the backward pass. + if (self._transformer_use_recompute_grad and training): + # The seed is not actually used since we do not have any random + # operation in the recomputed function. The purpose of the provided + # seed is to prevent recompute_grad from generating a new seed + # variable which is not compatible with model exporting. + transformer_block_fn = recompute_grad_lib.recompute_grad( + transformer_block_fn_no_recompute, seed=tf.constant(0, tf.int32)) + else: + transformer_block_fn = transformer_block_fn_no_recompute + + transformer_block_fn_input_list = [ + features, memory_space_output, float_tensor_training] + # We have to define drop_path_masks outside the layer call and pass it + # into the layer, because recompute_grad (gradient checkpointing) does + # not allow any randomness within the function call. In addition, + # recompute_grad functions can only take Tensors as inputs, so we do not + # pass the drop_path_masks (when they are None) into + # transformer_block_fn. + if current_drop_path_keep_prob < 1.0 and training: + # Drop path random mask for pixel space attention. + pixel_space_drop_path_mask = drop_path.generate_drop_path_random_mask( + memory_space_output, current_drop_path_keep_prob) + # Drop path random mask for memory space attention. + memory_space_attention_drop_path_mask = ( + drop_path.generate_drop_path_random_mask( + memory_space_output, current_drop_path_keep_prob)) + # Drop path random mask for memory space feed-forward network. + memory_space_feed_forward_network_drop_path_mask = ( + drop_path.generate_drop_path_random_mask( + memory_space_output, current_drop_path_keep_prob)) + transformer_block_fn_input_list += [ + pixel_space_drop_path_mask, + memory_space_attention_drop_path_mask, + memory_space_feed_forward_network_drop_path_mask] + + # Build the sub-layers when the transformer_block_fn is called for the + # first time. Otherwise, recompute_grad will not track newly built model + # parameters. + if self._first_building_call: + _ = transformer_block_fn_no_recompute( + tuple(transformer_block_fn_input_list)) + # Apply a dual-path transformer. + features, activated_features, memory_space_output = ( + transformer_block_fn(tuple(transformer_block_fn_input_list))) + + # Reshape pixel space features back to 4D. 
+ features = tf.reshape(features, [-1, height, width, channels]) + activated_features = tf.reshape(activated_features, + [-1, height, width, channels]) + # Now the first call has finished and the sub-layers have been built. + self._first_building_call = False + # We also return the non-activated output so that the function is compatible + # with a decoder that takes a non-activated tensor as input. + return features, activated_features, memory_space_output diff --git a/model/layers/axial_block_groups_test.py b/model/layers/axial_block_groups_test.py new file mode 100644 index 0000000000000000000000000000000000000000..b1283bc2f2623035e5b8374ade1974db6d474141 --- /dev/null +++ b/model/layers/axial_block_groups_test.py @@ -0,0 +1,182 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for axial_block_groups.""" + +import numpy as np +import tensorflow as tf + +from deeplab2.model import test_utils +from deeplab2.model.layers import axial_block_groups + + +class AxialBlockGroupsTest(tf.test.TestCase): + + def test_axial_attention_follows_bottleneck_block(self): + layer = axial_block_groups.BlockGroup( + filters=512, + num_blocks=2, + name='block_group', + original_resnet_stride=2, + original_resnet_input_stride=16, + use_axial_beyond_stride=32, + output_stride=16) + _, pixel_output, memory_output = layer((tf.zeros([2, 65, 65, 1024]), + tf.zeros([2, 128, 147]))) + self.assertListEqual(pixel_output.get_shape().as_list(), + [2, 65, 65, 2048]) + self.assertListEqual(memory_output.get_shape().as_list(), + [2, 128, 147]) + + def test_global_attention_follows_basic_block(self): + layer = axial_block_groups.BlockGroup( + filters=256, + num_blocks=2, + name='block_group', + backbone_type='wider_resnet', + original_resnet_stride=2, + original_resnet_input_stride=8, + use_global_beyond_stride=16, + positional_encoding_type='1D') + + _, pixel_output, memory_output = layer((tf.zeros([2, 65, 65, 32]), + tf.zeros([2, 128, 147]))) + self.assertListEqual(pixel_output.get_shape().as_list(), + [2, 33, 33, 1024]) + self.assertListEqual(memory_output.get_shape().as_list(), + [2, 128, 147]) + + def test_atrous_consistency_basic_block(self): + tf.random.set_seed(0) + pixel_inputs = test_utils.create_test_input(2, 11, 11, 3) + # Dense feature extraction followed by subsampling. + layer1 = axial_block_groups.BlockGroup( + filters=2, + num_blocks=2, + name='stage3', + backbone_type='wider_resnet', + original_resnet_stride=2, + original_resnet_input_stride=8, + output_stride=8, + use_axial_beyond_stride=0, + use_global_beyond_stride=0, + use_transformer_beyond_stride=0) + # Create the weights + layer1((pixel_inputs, None)) + weights = layer1.get_weights() + # Set the batch norm gamma as non-zero so that the 3x3 convolution affects + # the output. 
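+    # (With the default zero-initialized gamma, every residual branch would
+    # output zero and the blocks would reduce to identity mappings, so the
+    # atrous rate under test could never influence the comparison.)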
+ for index in range(len(weights)): + if np.sum(weights[index]) == 0.0: + weights[index] = weights[index] + 1 + layer1.set_weights(weights) + _, pixel_outputs, _ = layer1((pixel_inputs, None)) + output = pixel_outputs[:, ::2, ::2, :] + # Feature extraction at the nominal network rate. + layer2 = axial_block_groups.BlockGroup( + filters=2, + num_blocks=2, + name='stage3', + backbone_type='wider_resnet', + original_resnet_stride=2, + original_resnet_input_stride=8, + output_stride=16, + use_axial_beyond_stride=0, + use_global_beyond_stride=0, + use_transformer_beyond_stride=0) + # Create the weights + layer2((pixel_inputs, None)) + # Make the two networks use the same weights. + layer2.set_weights(layer1.get_weights()) + _, expected, _ = layer2((pixel_inputs, None)) + self.assertAllClose(output, expected, atol=1e-4, rtol=1e-4) + + def test_atrous_consistency_bottleneck_block(self): + tf.random.set_seed(0) + pixel_inputs = test_utils.create_test_input(2, 11, 11, 3) + # Dense feature extraction followed by subsampling. + layer1 = axial_block_groups.BlockGroup( + filters=2, + num_blocks=2, + name='stage3', + backbone_type='wider_resnet', + original_resnet_stride=2, + original_resnet_input_stride=16, + output_stride=16, + use_axial_beyond_stride=0, + use_global_beyond_stride=0, + use_transformer_beyond_stride=0) + # Create the weights + layer1((pixel_inputs, None)) + weights = layer1.get_weights() + # Set the batch norm gamma as non-zero so that the 3x3 convolution affects + # the output. + for index in range(len(weights)): + if np.sum(weights[index]) == 0.0: + weights[index] = weights[index] + 1 + layer1.set_weights(weights) + _, pixel_outputs, _ = layer1((pixel_inputs, None)) + output = pixel_outputs[:, ::2, ::2, :] + # Feature extraction at the nominal network rate. + layer2 = axial_block_groups.BlockGroup( + filters=2, + num_blocks=2, + name='stage3', + backbone_type='wider_resnet', + original_resnet_stride=2, + original_resnet_input_stride=16, + output_stride=32, + use_axial_beyond_stride=0, + use_global_beyond_stride=0, + use_transformer_beyond_stride=0) + # Create the weights + layer2((pixel_inputs, None)) + # Make the two networks use the same weights. 
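+    # (Any difference between the two outputs can then only come from the
+    # output_stride configuration under test.)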
+ layer2.set_weights(layer1.get_weights()) + _, expected, _ = layer2((pixel_inputs, None)) + self.assertAllClose(output, expected, atol=1e-4, rtol=1e-4) + + def test_use_se_sac_recompute_drop_path_schedule(self): + _ = axial_block_groups.BlockGroup( + filters=512, + num_blocks=2, + name='block_group', + original_resnet_stride=2, + original_resnet_input_stride=8, + use_axial_beyond_stride=0, + use_squeeze_and_excite=True, # True + use_sac_beyond_stride=16, # True + recompute_within_stride=16, # True + drop_path_beyond_stride=16, + drop_path_schedule='linear', # 1.0, 0.85 + output_stride=16) + + def test_nouse_se_sac_recompute_drop_path_schedule(self): + _ = axial_block_groups.BlockGroup( + filters=512, + num_blocks=2, + name='block_group', + original_resnet_stride=2, + original_resnet_input_stride=8, + use_axial_beyond_stride=0, + use_squeeze_and_excite=False, # False + use_sac_beyond_stride=32, # False + recompute_within_stride=8, # False + drop_path_beyond_stride=32, # 1.0, 1.0 + drop_path_schedule='constant', + output_stride=16) + +if __name__ == '__main__': + tf.test.main() diff --git a/model/layers/axial_blocks.py b/model/layers/axial_blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..bb21189461979d87aa5a8294959053a5960dfe76 --- /dev/null +++ b/model/layers/axial_blocks.py @@ -0,0 +1,308 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Implements Axial-Blocks proposed in Axial-DeepLab [1]. + +Axial-Blocks are based on residual bottleneck blocks, but with the 3x3 +convolution replaced with two axial-attention layers, one on the height-axis, +followed by the other on the width-axis. + +[1] Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, + ECCV 2020 Spotlight. + Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, + Liang-Chieh Chen. +""" +import tensorflow as tf + +from deeplab2.model import utils +from deeplab2.model.layers import activations +from deeplab2.model.layers import axial_layers +from deeplab2.model.layers import convolutions +from deeplab2.model.layers import squeeze_and_excite + + +class AxialBlock(tf.keras.layers.Layer): + """An AxialBlock as a building block for an Axial-ResNet model. + + We implement the Axial-Block proposed in [1] in a general way that also + includes convolutional residual blocks, such as the basic block and the + bottleneck block (w/ and w/o Switchable Atrous Convolution). + + A basic block consists of two 3x3 convolutions and a residual connection. It + is the main building block for wide-resnet variants. + + A bottleneck block consists of consecutive 1x1, 3x3, 1x1 convolutions and a + residual connection. It is the main building block for standard resnet + variants. + + An axial block consists of a 1x1 input convolution, a self-attention layer + (either axial-attention or global attention), a 1x1 output convolution, and a + residual connection. It is the main building block for axial-resnet variants. 
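+
+  For example, filters_list=[256, 256] builds a basic block, while
+  filters_list=[64, 64, 256] builds a bottleneck block; passing
+  attention_type='axial' or 'global' together with three filters builds an
+  axial block.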
+
+  Note: We apply the striding in the first spatial operation (i.e. 3x3
+  convolution or self-attention layer).
+  """
+
+  def __init__(self,
+               filters_list,
+               kernel_size=3,
+               strides=1,
+               atrous_rate=1,
+               use_squeeze_and_excite=False,
+               use_sac=False,
+               bn_layer=tf.keras.layers.BatchNormalization,
+               activation='relu',
+               name=None,
+               conv_kernel_weight_decay=0.0,
+               basic_block_second_conv_atrous_rate=None,
+               attention_type=None,
+               axial_layer_config=None):
+    """Initializes an AxialBlock.
+
+    Args:
+      filters_list: A list of filter numbers in the residual block. We
+        currently support filters_list with two or three elements. Two
+        elements specify the filters for two consecutive 3x3 convolutions,
+        while three elements specify the filters for three convolutions (1x1,
+        3x3, and 1x1).
+      kernel_size: The size of the convolution kernels (default: 3).
+      strides: The strides of the block (default: 1).
+      atrous_rate: The atrous rate of the 3x3 convolutions (default: 1). If
+        this residual block is a basic block, it is recommended to specify a
+        correct basic_block_second_conv_atrous_rate for the second 3x3
+        convolution. Otherwise, the second conv will also use atrous_rate,
+        which might cause atrous inconsistency with different output strides,
+        as tested in axial_block_groups_test.test_atrous_consistency_basic_block.
+      use_squeeze_and_excite: A boolean flag indicating whether
+        squeeze-and-excite (SE) is used.
+      use_sac: A boolean, whether to use Switchable Atrous Convolution (SAC).
+      bn_layer: A tf.keras.layers.Layer that computes the normalization
+        (default: tf.keras.layers.BatchNormalization).
+      activation: A string specifying the activation function to apply.
+      name: A string specifying the name of the layer (default: None).
+      conv_kernel_weight_decay: A float, the weight decay for convolution
+        kernels.
+      basic_block_second_conv_atrous_rate: An integer, the atrous rate for the
+        second convolution of a basic block. This is necessary to ensure
+        atrous consistency with different output_strides. Defaults to
+        atrous_rate.
+      attention_type: A string, the type of attention to apply. Supports
+        'axial' and 'global'.
+      axial_layer_config: A dict, an argument dictionary for the axial layer.
+
+    Raises:
+      ValueError: If filters_list does not have two or three elements.
+      ValueError: If attention_type is not supported.
+      ValueError: If double_global_attention is True in axial_layer_config.
+    """
+    super(AxialBlock, self).__init__(name=name)
+
+    self._filters_list = filters_list
+    self._strides = strides
+    self._use_squeeze_and_excite = use_squeeze_and_excite
+    self._bn_layer = bn_layer
+    self._activate_fn = activations.get_activation(activation)
+    self._attention_type = attention_type
+
+    if axial_layer_config is None:
+      axial_layer_config = {}
+
+    if basic_block_second_conv_atrous_rate is None:
+      basic_block_second_conv_atrous_rate = atrous_rate
+
+    if len(filters_list) == 3:
+      # Three consecutive convolutions: 1x1, 3x3, and 1x1.
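+      # (The first 1x1 reduces the channels before the expensive spatial
+      # operation, and the last 1x1 restores the output channels.)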
+      self._conv1_bn_act = convolutions.Conv2DSame(
+          filters_list[0], 1, 'conv1_bn_act',
+          use_bias=False,
+          use_bn=True,
+          bn_layer=bn_layer,
+          activation=activation,
+          conv_kernel_weight_decay=conv_kernel_weight_decay)
+
+      if attention_type is None or attention_type.lower() == 'none':
+        self._conv2_bn_act = convolutions.Conv2DSame(
+            filters_list[1], kernel_size, 'conv2_bn_act',
+            strides=strides,
+            atrous_rate=atrous_rate,
+            use_bias=False,
+            use_bn=True,
+            bn_layer=bn_layer,
+            activation=activation,
+            use_switchable_atrous_conv=use_sac,
+            # We default to using global context in SAC if use_sac is True.
+            # This setting is experimentally found effective.
+            use_global_context_in_sac=use_sac,
+            conv_kernel_weight_decay=conv_kernel_weight_decay)
+      elif attention_type == 'axial':
+        if 'double_global_attention' in axial_layer_config:
+          if axial_layer_config['double_global_attention']:
+            raise ValueError('Double_global_attention has no effect in '
+                             'AxialAttention2D.')
+          del axial_layer_config['double_global_attention']
+        self._attention = axial_layers.AxialAttention2D(
+            strides=strides,
+            filters=filters_list[1],
+            name='attention',
+            bn_layer=bn_layer,
+            conv_kernel_weight_decay=conv_kernel_weight_decay,
+            **axial_layer_config)
+      elif attention_type == 'global':
+        self._attention = axial_layers.GlobalAttention2D(
+            strides=strides,
+            filters=filters_list[1],
+            name='attention',
+            bn_layer=bn_layer,
+            conv_kernel_weight_decay=conv_kernel_weight_decay,
+            **axial_layer_config)
+      else:
+        raise ValueError(attention_type + ' is not supported.')
+
+      # Here we apply a batch norm with gamma initialized at zero. This
+      # ensures that at random initialization of the model, the skip
+      # connections dominate all residual blocks. In this way, all the skip
+      # connections construct an identity mapping that passes the gradients
+      # (without any distortion from the randomly initialized blocks) to all
+      # residual blocks. This trick helps training at early epochs.
+      # Reference: "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour".
+      # https://arxiv.org/abs/1706.02677
+      self._conv3_bn = convolutions.Conv2DSame(
+          filters_list[2], 1, 'conv3_bn',
+          use_bias=False,
+          use_bn=True,
+          bn_layer=bn_layer,
+          bn_gamma_initializer='zeros',
+          activation='none',
+          conv_kernel_weight_decay=conv_kernel_weight_decay)
+    elif len(filters_list) == 2:
+      # Two consecutive convolutions: 3x3 and 3x3.
+      self._conv1_bn_act = convolutions.Conv2DSame(
+          filters_list[0], kernel_size, 'conv1_bn_act',
+          strides=strides,
+          atrous_rate=atrous_rate,
+          use_bias=False,
+          use_bn=True,
+          bn_layer=bn_layer,
+          activation=activation,
+          use_switchable_atrous_conv=use_sac,
+          use_global_context_in_sac=use_sac,
+          conv_kernel_weight_decay=conv_kernel_weight_decay)
+      # Here we apply a batch norm with gamma initialized at zero. This
+      # ensures that at random initialization of the model, the skip
+      # connections dominate all residual blocks. In this way, all the skip
+      # connections construct an identity mapping that passes the gradients
+      # (without any distortion from the randomly initialized blocks) to all
+      # residual blocks. This trick helps training at early epochs.
+      # Reference: "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour".
+      # https://arxiv.org/abs/1706.02677
+      self._conv2_bn = convolutions.Conv2DSame(
+          filters_list[1], kernel_size, 'conv2_bn',
+          strides=1,
+          atrous_rate=basic_block_second_conv_atrous_rate,
+          use_bias=False,
+          use_bn=True,
+          bn_layer=bn_layer,
+          bn_gamma_initializer='zeros',
+          activation='none',
+          use_switchable_atrous_conv=use_sac,
+          use_global_context_in_sac=use_sac,
+          conv_kernel_weight_decay=conv_kernel_weight_decay)
+    else:
+      raise ValueError('Expect filters_list to have length 2 or 3; got %d' %
+                       len(filters_list))
+
+    if self._use_squeeze_and_excite:
+      self._squeeze_and_excite = squeeze_and_excite.SimplifiedSqueezeAndExcite(
+          filters_list[-1])
+    self._conv_kernel_weight_decay = conv_kernel_weight_decay
+
+  def build(self, input_shape_list):
+    input_tensor_shape = input_shape_list[0]
+    self._shortcut = None
+    if input_tensor_shape[3] != self._filters_list[-1]:
+      self._shortcut = convolutions.Conv2DSame(
+          self._filters_list[-1], 1, 'shortcut',
+          strides=self._strides,
+          use_bias=False,
+          use_bn=True,
+          bn_layer=self._bn_layer,
+          activation='none',
+          conv_kernel_weight_decay=self._conv_kernel_weight_decay)
+
+  def call(self, inputs):
+    """Performs a forward pass.
+
+    We have to define drop_path_random_mask outside the layer call and pass it
+    into the layer, because recompute_grad (gradient checkpointing) does not
+    allow any randomness within the function call. In addition, recompute_grad
+    only supports float tensors as inputs. For this reason, the training flag
+    should also be passed as a float tensor. For the same reason, we cannot
+    support passing drop_path_random_mask as None. Instead, we ask the users
+    to pass only the first two tensors when drop path is not used.
+
+    Args:
+      inputs: A tuple of 2 or 3 tensors, containing:
+        input_tensor: An input tensor of type tf.Tensor with shape [batch,
+          height, width, channels].
+        float_tensor_training: A float tensor of 0.0 or 1.0, indicating
+          whether the model is in training mode.
+        drop_path_random_mask: (optional) A drop path random mask of type
+          tf.Tensor with shape [batch, 1, 1, 1].
+
+    Returns:
+      outputs: Two tensors. The first tensor does not use the last activation
+        function. The second tensor uses the activation. We return the
+        non-activated output to support MaX-DeepLab, which uses non-activated
+        features for the stacked decoders.
+
+    Raises:
+      ValueError: If the length of inputs is not 2 or 3.
+    """
+    if len(inputs) not in (2, 3):
+      raise ValueError('The length of inputs should be either 2 or 3.')
+
+    # Unpack the inputs.
+    input_tensor, float_tensor_training, drop_path_random_mask = (
+        utils.pad_sequence_with_none(inputs, target_length=3))
+
+    # Recompute_grad takes only float tensors as inputs. It does not allow
+    # bools or boolean tensors. For this reason, we cast training to a float
+    # tensor outside this call, and now we cast it back to a boolean tensor.
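+    # (tf.cast maps 0.0 to False and any non-zero float to True.)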
+ training = tf.cast(float_tensor_training, tf.bool) + + shortcut = input_tensor + if self._shortcut is not None: + shortcut = self._shortcut(shortcut, training=training) + elif self._strides != 1: + shortcut = shortcut[:, ::self._strides, ::self._strides, :] + + if len(self._filters_list) == 3: + x = self._conv1_bn_act(input_tensor, training=training) + if (self._attention_type is None or + self._attention_type.lower() == 'none'): + x = self._conv2_bn_act(x, training=training) + else: + x = self._attention(x, training=training) + x = self._activate_fn(x) + x = self._conv3_bn(x, training=training) + if len(self._filters_list) == 2: + x = self._conv1_bn_act(input_tensor, training=training) + x = self._conv2_bn(x, training=training) + + if self._use_squeeze_and_excite: + x = self._squeeze_and_excite(x) + + if drop_path_random_mask is not None: + x = x * drop_path_random_mask + x = x + shortcut + return x, self._activate_fn(x) diff --git a/model/layers/axial_blocks_test.py b/model/layers/axial_blocks_test.py new file mode 100644 index 0000000000000000000000000000000000000000..3dad90a38b9587358d898e63fa5d47796e17b1fc --- /dev/null +++ b/model/layers/axial_blocks_test.py @@ -0,0 +1,54 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for axial_blocks.""" + +import tensorflow as tf + +from deeplab2.model.layers import axial_blocks + + +class AxialBlocksTest(tf.test.TestCase): + + def test_conv_basic_block_correct_output_shape(self): + layer = axial_blocks.AxialBlock( + filters_list=[256, 256], + strides=2) + float_training_tensor = tf.constant(0.0, dtype=tf.float32) + output = layer((tf.zeros([2, 65, 65, 32]), + float_training_tensor))[1] + self.assertListEqual(output.get_shape().as_list(), [2, 33, 33, 256]) + + def test_conv_bottleneck_block_correct_output_shape(self): + layer = axial_blocks.AxialBlock( + filters_list=[64, 64, 256], + strides=1) + float_training_tensor = tf.constant(0.0, dtype=tf.float32) + output = layer((tf.zeros([2, 65, 65, 32]), + float_training_tensor))[0] + self.assertListEqual(output.get_shape().as_list(), [2, 65, 65, 256]) + + def test_axial_block_correct_output_shape(self): + layer = axial_blocks.AxialBlock( + filters_list=[128, 64, 256], + strides=2, + attention_type='axial') + float_training_tensor = tf.constant(0.0, dtype=tf.float32) + output = layer((tf.zeros([2, 65, 65, 32]), + float_training_tensor))[1] + self.assertListEqual(output.get_shape().as_list(), [2, 33, 33, 256]) + +if __name__ == '__main__': + tf.test.main() diff --git a/model/layers/axial_layers.py b/model/layers/axial_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..48e2f8651c1f3ea1b8eeafc987ffbf6bae753161 --- /dev/null +++ b/model/layers/axial_layers.py @@ -0,0 +1,523 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Implements Axial-Attention layers proposed in Axial-DeepLab.
+
+Axial-Attention factorizes 2D self-attention into two 1D self-attentions, so
+that it can be applied on large inputs. Axial-Attention is typically used to
+replace 3x3 convolutions in a bottleneck residual block.
+
+[1] Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation,
+    ECCV 2020 Spotlight.
+    Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille,
+    Liang-Chieh Chen.
+"""
+
+import numpy as np
+import tensorflow as tf
+
+from deeplab2.model import utils
+from deeplab2.model.layers import activations
+from deeplab2.model.layers import positional_encodings
+
+
+class AxialAttention(tf.keras.layers.Layer):
+  """An axial-attention layer."""
+
+  def __init__(self,
+               query_shape=129,
+               memory_flange=32,
+               total_key_depth=512,
+               total_value_depth=1024,
+               num_heads=8,
+               name='axial_attention',
+               use_query_rpe_similarity=True,
+               use_key_rpe_similarity=True,
+               use_content_similarity=True,
+               retrieve_value_rpe=True,
+               retrieve_value_content=True,
+               initialization_std_for_query_key_rpe=1.0,
+               initialization_std_for_value_rpe=1.0,
+               self_attention_activation='softmax',
+               bn_layer=tf.keras.layers.BatchNormalization,
+               conv_kernel_weight_decay=0.0):
+    """Initializes an axial-attention layer.
+
+    This function is designed to support both global and local axial-attention
+    in a unified way. If query_shape is larger than the length of the input,
+    global attention is applied. If query_shape is smaller than the length of
+    the input, local attention is applied. In this case, the input is divided
+    into blocks of length query_shape, padded by memory_flange on both sides.
+    Then, local attention is applied within each query block. The choice of
+    query_shape does not affect the output value but affects computation
+    efficiency and memory usage. In general, use global attention (large
+    query_shape) if possible. Local axial-attention is not supported yet.
+
+    Args:
+      query_shape: An integer, the block size for local axial attention.
+        Defaults to 129 since 129 is usually the largest feature map where we
+        do global attention (1025 with stride 8, or 2049 with stride 16).
+      memory_flange: An integer, the memory flange padded to each query block
+        in local attention. It has no effect in global attention. Defaults to
+        32, which is equivalent to a span of 65 in the Axial-DeepLab paper --
+        a pixel can see 32 pixels on its left and 32 pixels on its right.
+      total_key_depth: An integer, the total depth of keys, which is also the
+        depth of queries and the depth of key (query) positional encodings.
+      total_value_depth: An integer, the total depth of the values, which is
+        also the depth of value positional encodings.
+      num_heads: An integer, the number of heads in multi-head attention.
+      name: A string, the name of this axial attention layer.
+      use_query_rpe_similarity: A boolean, whether to use the attention
+        similarity between the queries and the relative positional encodings.
+      use_key_rpe_similarity: A boolean, whether to use the attention
+        similarity between the keys and the relative positional encodings.
+ use_content_similarity: A boolean, whether to use the content similarity + between the queries and the keys. + retrieve_value_rpe: A boolean, whether to retrieve the relative positional + encodings of the values. + retrieve_value_content: A boolean, whether to retrieve the content of the + values. + initialization_std_for_query_key_rpe: A float, the initialization std for + the relative positional encodings of the queries and keys. + initialization_std_for_value_rpe: A float, the initialization std for the + relative positional encodings of the values. + self_attention_activation: A string, type of activation function for + self-attention. Support 'sigmoid' and 'softmax'. + bn_layer: A tf.keras.layers.Layer that computes the normalization + (default: tf.keras.layers.BatchNormalization). + conv_kernel_weight_decay: A float, the weight decay for convolution + kernels. + + Returns: + output: A [batch, length, total_value_depth] tensor. + + Raises: + ValueError: If none of the three similarities (use_query_rpe_similarity, + use_key_rpe_similarity, use_content_similarity) is used. + ValueError: If neither of value content or value rpe is retrieved. + ValueError: If self_attention_activation is not supported. + ValueError: If total_key_depth is not divisible by num_heads. + ValueError: If total_value_depth is not divisible by num_heads. + """ + # Validate the attention similarity choices. + if not any([ + use_content_similarity, use_key_rpe_similarity, use_query_rpe_similarity + ]): + raise ValueError( + 'Should use at least one similarity to compute attention.') + + # Validate the retrieve value choices. + if not retrieve_value_content and not retrieve_value_rpe: + raise ValueError('Should retrieve at least one of content or rpe.') + + if total_key_depth % num_heads: + raise ValueError('Total_key_depth should be divisible by num_heads.') + + if total_value_depth % num_heads: + raise ValueError('Total_value_depth should be divisible by num_heads.') + + super(AxialAttention, self).__init__(name=name) + self._query_shape = query_shape + self._memory_flange = memory_flange + self._total_key_depth = total_key_depth + self._total_value_depth = total_value_depth + self._num_heads = num_heads + self._use_query_rpe_similarity = use_query_rpe_similarity + self._use_key_rpe_similarity = use_key_rpe_similarity + self._use_content_similarity = use_content_similarity + self._retrieve_value_rpe = retrieve_value_rpe + self._retrieve_value_content = retrieve_value_content + self._initialization_std_for_query_key_rpe = ( + initialization_std_for_query_key_rpe) + self._initialization_std_for_value_rpe = initialization_std_for_value_rpe + self._self_attention_activation = self_attention_activation + self._conv_kernel_weight_decay = conv_kernel_weight_decay + + self._batch_norm_qkv = bn_layer(axis=-1, name='batch_norm_qkv') + self._batch_norm_similarity = bn_layer( + axis=[0, 2], name='batch_norm_similarity') + self._batch_norm_retrieved_output = bn_layer( + axis=[0, 2, 4], name='batch_norm_retrieved_output') + + self._key_depth_per_head = total_key_depth // num_heads + self._attention_activate_fn = activations.get_activation( + self_attention_activation) + + def build(self, input_shape): + """Builds axial-attention layer weights. + + Args: + input_shape: An integer list of length 3, the shape of the input tensor. + + Raises: + NotImplementedError: Local axial-attention has not been implemented. It is + triggered if query_shape is less than input_shape. 
+ """ + + # Apply global attention if query_shape is larger than input_shape[1]. + if self._query_shape >= input_shape[1]: + self._query_shape = input_shape[1] + self._memory_flange = 0 + else: + raise NotImplementedError('Local axial attention has not been ' + 'implemented yet.') + self._memory_shape = self._query_shape + 2 * self._memory_flange + + # Compute query key value with one convolution and an optional batch norm. + # The initialization std is standard transformer initialization (without + # batch norm), as used in SASA and ViT. In our case, we use batch norm by + # default, so it does not require careful tuning. If one wants to remove + # all batch norms in axial attention, this standard initialization should + # still be good, but a more careful initialization is encouraged. + self.qkv_kernel = self.add_weight( + name='qkv_kernel', + shape=[input_shape[-1], + self._total_key_depth * 2 + self._total_value_depth], + initializer=tf.keras.initializers.TruncatedNormal( + stddev=input_shape[-1]**-0.5), + regularizer=tf.keras.regularizers.l2(self._conv_kernel_weight_decay)) + + if self._use_query_rpe_similarity: + self._query_rpe = positional_encodings.RelativePositionalEncoding( + self._query_shape, + self._memory_shape, + self._key_depth_per_head, + self._num_heads, + 'query_rpe', + initialization_std=self._initialization_std_for_query_key_rpe, + conv_kernel_weight_decay=self._conv_kernel_weight_decay) + + if self._use_key_rpe_similarity: + self._key_rpe = positional_encodings.RelativePositionalEncoding( + self._query_shape, + self._memory_shape, + self._key_depth_per_head, + self._num_heads, + 'key_rpe', + initialization_std=self._initialization_std_for_query_key_rpe, + conv_kernel_weight_decay=self._conv_kernel_weight_decay) + + if self._retrieve_value_rpe: + self._value_rpe = positional_encodings.RelativePositionalEncoding( + self._query_shape, + self._memory_shape, + self._total_value_depth // self._num_heads, + self._num_heads, + 'value_rpe', + initialization_std=self._initialization_std_for_value_rpe, + conv_kernel_weight_decay=self._conv_kernel_weight_decay) + + def call(self, input_tensor, training=False): + """Performs a forward pass. + + Args: + input_tensor: An input [batch, length, channel] tensor. + training: A boolean flag indicating whether training behavior should be + used (default: False). + + Returns: + output: An output [batch, length, total_value_depth] tensor. + """ + # Alternatively, the einsum can be implemented as a 1x1 convolution. + # However, it is not obvious which implementation is more efficient (without + # careful benchmarking), so we use einsum for its flexibility and + # consistency with other parts of the function. + query_key_value = tf.einsum( + 'nlc,cd->nld', input_tensor, self.qkv_kernel, name='compute_qkv') + query_key_value = self._batch_norm_qkv(query_key_value, training=training) + + # Split query key value. + query, key, value = tf.split( + query_key_value, + [self._total_key_depth, self._total_key_depth, self._total_value_depth], + axis=-1) + + # Reshape the query, key, and value. 
+    query = tf.reshape(query, [-1, self._query_shape, self._num_heads,
+                               self._key_depth_per_head])
+    query = tf.transpose(a=query, perm=[0, 2, 1, 3])
+    key = tf.reshape(key, [-1, np.prod(self._memory_shape), self._num_heads,
+                           self._key_depth_per_head])
+    key = tf.transpose(a=key, perm=[0, 2, 1, 3])
+    value = tf.reshape(value, [-1, np.prod(self._memory_shape),
+                               self._num_heads,
+                               self._total_value_depth // self._num_heads])
+
+    # Gather all similarity logits into a list.
+    similarity_logits = []
+
+    # Compute the content similarity term: q * k.
+    if self._use_content_similarity:
+      content_similarity = tf.einsum(
+          'bhld,bhmd->bhlm', query, key, name='content_similarity')
+      similarity_logits.append(content_similarity)
+
+    # Compute the query rpe similarity term: q * rpe.
+    if self._use_query_rpe_similarity:
+      query_rpe = self._query_rpe(None)
+      query_rpe_similarity = tf.einsum(
+          'bhld,hlmd->bhlm', query, query_rpe, name='query_rpe_similarity')
+      similarity_logits.append(query_rpe_similarity)
+
+    # Compute the key rpe similarity term: k * rpe.
+    if self._use_key_rpe_similarity:
+      key_rpe = self._key_rpe(None)
+      key_rpe_similarity = tf.einsum(
+          'bhmd,hlmd->bhlm', key, key_rpe, name='key_rpe_similarity')
+      similarity_logits.append(key_rpe_similarity)
+
+    # Apply an optional batch norm to the similarities and sum them.
+    similarity_logits = tf.stack(similarity_logits)
+    similarity_logits = self._batch_norm_similarity(similarity_logits,
+                                                    training=training)
+    similarity_logits = tf.reduce_sum(input_tensor=similarity_logits, axis=0)
+
+    # Apply an attention activation function, e.g. softmax.
+    weights = self._attention_activate_fn(similarity_logits)
+
+    # Gather retrieved values or rpes into a list.
+    retrieve_list = []
+
+    # Retrieve the content of the attended value.
+    if self._retrieve_value_content:
+      retrieved_content = tf.einsum(
+          'bhlm,bmhd->bhld', weights, value, name='retrieve_value_content')
+      retrieve_list.append(retrieved_content)
+
+    # Retrieve the relative position of the attended value.
+    if self._retrieve_value_rpe:
+      value_rpe = self._value_rpe(None)
+      retrieved_rpe = tf.einsum(
+          'bhlm,hlmd->bhld', weights, value_rpe, name='retrieve_value_rpe')
+      retrieve_list.append(retrieved_rpe)
+
+    # Apply batch norms to retrieved contents and rpes respectively.
+    retrieved_output = tf.stack(retrieve_list)
+    retrieved_output = self._batch_norm_retrieved_output(retrieved_output,
+                                                         training=training)
+    # Sum the retrieved contents and rpes.
+    retrieved_output = tf.reduce_sum(input_tensor=retrieved_output, axis=0)
+
+    # Combine the heads by transposing and reshaping the tensor.
+    retrieved_output = utils.transpose_and_reshape_for_attention_operation(
+        retrieved_output)
+
+    return retrieved_output
+
+
+class AxialAttention2D(tf.keras.layers.Layer):
+  """Sequentially applies height-axis and width-axis axial-attention."""
+
+  def __init__(self,
+               strides=1,
+               filters=512,
+               name='attention',
+               key_expansion=1,
+               value_expansion=2,
+               query_shape=(129, 129),
+               memory_flange=(32, 32),
+               **kwargs):
+    """Initializes an AxialAttention2D layer.
+
+    Args:
+      strides: An integer, the stride for the output, usually 1 or 2.
+      filters: An integer, the base number of channels for the layer.
+      name: A string, the name of the attention layer.
+      key_expansion: A float, the channel expansion ratio for keys.
+      value_expansion: A float, the channel expansion ratio for values.
+      query_shape: An integer list of length 2, the maximum query shape for
+        the height axis and the width axis.
+      memory_flange: An integer list of length 2.
+        The memory flange for the height axis and the width axis.
+      **kwargs: A dictionary of keyword arguments passed to height-axis,
+        width-axis, and 2D global AxialAttention.
+
+    Returns:
+      output: A [batch, strided height, strided width, output_channels]
+        tensor.
+    """
+    super(AxialAttention2D, self).__init__(name=name)
+    total_key_depth = int(round(filters * key_expansion))
+    total_value_depth = int(round(filters * value_expansion))
+    self._strides = strides
+    self._total_key_depth = total_key_depth
+    self._total_value_depth = total_value_depth
+    self._height_axis = AxialAttention(
+        total_key_depth=total_key_depth,
+        total_value_depth=total_value_depth,
+        query_shape=query_shape[0],
+        memory_flange=memory_flange[0],
+        name='height_axis',
+        **kwargs)
+    self._width_axis = AxialAttention(
+        total_key_depth=total_key_depth,
+        total_value_depth=total_value_depth,
+        query_shape=query_shape[1],
+        memory_flange=memory_flange[1],
+        name='width_axis',
+        **kwargs)
+
+  def call(self, inputs, training=False):
+    """Performs a forward pass.
+
+    Args:
+      inputs: An input [batch, height, width, channel] tensor.
+      training: A boolean flag indicating whether training behavior should be
+        used (default: False).
+
+    Returns:
+      output: An output [batch, strided_height, strided_width,
+        filters * value_expansion] tensor.
+    """
+    _, height, width, channel = inputs.get_shape().as_list()
+
+    # Transpose and reshape the width axis to the batch dimension.
+    x = tf.transpose(a=inputs, perm=[0, 2, 1, 3])
+    x = tf.reshape(x, [-1, height, channel])
+    x = self._height_axis(x, training=training)
+    # Reshape and transpose back to a 4D tensor.
+    x = tf.reshape(x, [-1, width, height, self._total_value_depth])
+    x = tf.transpose(a=x, perm=[0, 2, 1, 3])
+    # Height axis striding.
+    if self._strides > 1:
+      x = x[:, ::self._strides, :, :]
+
+    # Reshape the height axis to the batch dimension.
+    _, strided_height, _, _ = x.get_shape().as_list()
+    x = tf.reshape(x, [-1, width, self._total_value_depth])
+    x = self._width_axis(x, training=training)
+    # Reshape back to a 4D tensor.
+    x = tf.reshape(x, [-1, strided_height, width, self._total_value_depth])
+    # Width axis striding.
+    if self._strides > 1:
+      x = x[:, :, ::self._strides, :]
+
+    return x
+
+
+class GlobalAttention2D(tf.keras.layers.Layer):
+  """A 2D global attention layer."""
+
+  def __init__(self,
+               strides=1,
+               filters=512,
+               name='attention',
+               key_expansion=1,
+               value_expansion=2,
+               query_shape=(129, 129),
+               memory_flange=(32, 32),
+               double_global_attention=False,
+               **kwargs):
+    """Initializes a GlobalAttention2D layer.
+
+    Args:
+      strides: An integer, the stride for the output, usually 1 or 2.
+      filters: An integer, the base number of channels for the layer.
+      name: A string, the name of the attention layer.
+      key_expansion: A float, the channel expansion ratio for keys.
+      value_expansion: A float, the channel expansion ratio for values.
+      query_shape: An integer list of length 2, the maximum query shape for
+        the height axis and the width axis.
+      memory_flange: An integer list of length 2. The memory flange for the
+        height axis and the width axis.
+      double_global_attention: A boolean, whether to use two global attention
+        layers. Two global attention layers match the parameter count to a
+        sequentially applied height and width axial attention layer.
+      **kwargs: A dictionary of keyword arguments passed to height-axis,
+        width-axis, and 2D global AxialAttention.
+
+    Returns:
+      output: A [batch, strided height, strided width, output_channels]
+        tensor.
+ + Raises: + ValueError: If relative positional encoding is enforced in kwargs. + """ + if any([kwargs.get('use_query_rpe_similarity', False), + kwargs.get('use_key_rpe_similarity', False), + kwargs.get('retrieve_value_rpe', False)]): + raise ValueError('GlobalAttention2D does not support relative positional ' + 'encodings.') + + super(GlobalAttention2D, self).__init__(name=name) + total_key_depth = int(round(filters * key_expansion)) + total_value_depth = int(round(filters * value_expansion)) + self._strides = strides + self._double_global_attention = double_global_attention + self._total_key_depth = total_key_depth + self._total_value_depth = total_value_depth + + # Global attention does not support relative positional encodings. + kwargs['use_query_rpe_similarity'] = False + kwargs['use_key_rpe_similarity'] = False + kwargs['retrieve_value_rpe'] = False + self._kwargs = kwargs + + def build(self, input_shape): + """Builds global attention layers according to the 4D input_shape.""" + _, height, width, _ = input_shape + # Implement 2D global attention as 1D axial-attention by flattening the 2D + # inputs into 1D. We also disable the relative positional encodings in + # axial attention, so that only content-based attention is used. The query + # shape is set to height * width, so that the axial attention is global. + self._global = AxialAttention( + total_key_depth=self._total_key_depth, + total_value_depth=self._total_value_depth, + query_shape=height*width, + memory_flange=0, + name='global', + **self._kwargs) + + # Use two global attention layers in one residual block. This option + # ensures that global attention models have similar number of layers and + # parameters as axial-attention models. + if self._double_global_attention: + self._global2 = AxialAttention( + total_key_depth=self._total_key_depth, + total_value_depth=self._total_value_depth, + query_shape=height*width, + memory_flange=0, + name='global2', + **self._kwargs) + + def call(self, inputs, training=False): + """Performs a forward pass. + + Args: + inputs: An input [batch, height, width, channel] tensor. + training: A boolean flag indicating whether training behavior should be + used (default: False). + + Returns: + output: An output [batch, strided_height, strided_width, + filters * value_expansion] tensor. + """ + _, height, width, channel = inputs.get_shape().as_list() + + # Reshape the inputs so that the attention is global 2D. + x = tf.reshape(inputs, [-1, height * width, channel]) + + # Implement 2D global attention as 1D axial-attention by flattening the 2D + # inputs into 1D. We also disable the relative positional encodings in + # axial attention, so that only content-based attention is used. + x = self._global(x, training=training) + + # Use two global attention layers in one residual block. This option + # ensures that global attention models have the same number of layers and + # parameters as axial-attention models. + if self._double_global_attention: + x = self._global2(x, training=training) + x = tf.reshape(x, [-1, height, width, self._total_value_depth]) + if self._strides > 1: + x = x[:, ::self._strides, ::self._strides, :] + + return x diff --git a/model/layers/axial_layers_test.py b/model/layers/axial_layers_test.py new file mode 100644 index 0000000000000000000000000000000000000000..2fb8accdd470e25ec3ad896ba16bff2739a0dbbc --- /dev/null +++ b/model/layers/axial_layers_test.py @@ -0,0 +1,56 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for axial_layers.""" + +import tensorflow as tf + +from deeplab2.model.layers import axial_layers + + +class AxialLayersTest(tf.test.TestCase): + + def test_default_axial_attention_layer_output_shape(self): + layer = axial_layers.AxialAttention() + output = layer(tf.zeros([10, 5, 32])) + self.assertListEqual(output.get_shape().as_list(), [10, 5, 1024]) + + def test_axial_attention_2d_layer_output_shape(self): + layer = axial_layers.AxialAttention2D() + output = layer(tf.zeros([2, 5, 5, 32])) + self.assertListEqual(output.get_shape().as_list(), [2, 5, 5, 1024]) + + def test_change_filters_output_shape(self): + layer = axial_layers.AxialAttention2D(filters=32) + output = layer(tf.zeros([2, 5, 5, 32])) + self.assertListEqual(output.get_shape().as_list(), [2, 5, 5, 64]) + + def test_value_expansion_output_shape(self): + layer = axial_layers.AxialAttention2D(value_expansion=1) + output = layer(tf.zeros([2, 5, 5, 32])) + self.assertListEqual(output.get_shape().as_list(), [2, 5, 5, 512]) + + def test_global_attention_output_shape(self): + layer = axial_layers.GlobalAttention2D() + output = layer(tf.zeros([2, 5, 5, 32])) + self.assertListEqual(output.get_shape().as_list(), [2, 5, 5, 1024]) + + def test_stride_two_output_shape(self): + layer = axial_layers.AxialAttention2D(strides=2) + output = layer(tf.zeros([2, 5, 5, 32])) + self.assertListEqual(output.get_shape().as_list(), [2, 3, 3, 1024]) + +if __name__ == '__main__': + tf.test.main() diff --git a/model/layers/blocks.py b/model/layers/blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..3e46651aeaacf1e416ffa19b43de433f2031cc31 --- /dev/null +++ b/model/layers/blocks.py @@ -0,0 +1,211 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Implements building blocks for neural networks.""" +from typing import Optional + +from absl import logging + +import tensorflow as tf + +from deeplab2.model import utils +from deeplab2.model.layers import convolutions +from deeplab2.model.layers import squeeze_and_excite + +backend = tf.keras.backend +layers = tf.keras.layers + + +class InvertedBottleneckBlock(tf.keras.layers.Layer): + """An inverted bottleneck block. + + Reference: + Sandler, M., Howard, A., et al. Mobilenetv2: Inverted residuals and linear + bottlenecks. In CVPR, 2018 + Howard, A., Sandler, M., et al. Searching for mobilenetv3. 
+      In ICCV, 2019
+  """
+
+  def __init__(self,
+               in_filters: int,
+               out_filters: int,
+               expand_ratio: int,
+               strides: int,
+               kernel_size: int = 3,
+               se_ratio: Optional[float] = None,
+               activation: str = 'relu',
+               se_inner_activation: str = 'relu',
+               se_gating_activation: str = 'sigmoid',
+               depthwise_activation: Optional[str] = None,
+               expand_se_in_filters: bool = False,
+               atrous_rate: int = 1,
+               divisible_by: int = 1,
+               bn_layer: layers.Layer = tf.keras.layers.BatchNormalization,
+               conv_kernel_weight_decay: float = 0.0,
+               regularize_depthwise: bool = False,
+               use_depthwise: bool = True,
+               use_residual: bool = True,
+               name: Optional[str] = None):
+    """Initializes an inverted bottleneck block with BN after convolutions.
+
+    Args:
+      in_filters: The number of filters of the input tensor.
+      out_filters: The number of filters of the output tensor.
+      expand_ratio: The expansion ratio for an inverted bottleneck block. If
+        expand_ratio is <= 1, the expansion convolution is skipped in the
+        forward pass.
+      strides: The stride of the block. If greater than 1, this block will
+        ultimately downsample the input.
+      kernel_size: The kernel size of the depthwise conv layer.
+      se_ratio: If not None, the SE ratio for the squeeze-and-excitation
+        layer.
+      activation: The name of the activation function.
+      se_inner_activation: The name of squeeze-excitation inner activation.
+      se_gating_activation: The name of squeeze-excitation gating activation.
+      depthwise_activation: The name of the activation function for depthwise
+        only.
+      expand_se_in_filters: Whether or not to expand in_filters in the
+        squeeze-and-excitation layer.
+      atrous_rate: The atrous dilation rate to use.
+      divisible_by: A number that all inner dimensions are divisible by.
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+      conv_kernel_weight_decay: The weight decay for convolution kernels.
+      regularize_depthwise: Whether or not to apply regularization on the
+        depthwise convolution.
+      use_depthwise: Whether to use a depthwise convolution. If False, a
+        standard convolution with the full kernel size and stride is used
+        instead.
+      use_residual: Whether to include a residual connection between the
+        input and the output.
+      name: Name for the block.
+    """
+    super(InvertedBottleneckBlock, self).__init__(name=name)
+
+    self._in_filters = in_filters
+    self._out_filters = out_filters
+    self._expand_ratio = expand_ratio
+    self._strides = strides
+    self._kernel_size = kernel_size
+    self._se_ratio = se_ratio
+    self._divisible_by = divisible_by
+    self._atrous_rate = atrous_rate
+    self._regularize_depthwise = regularize_depthwise
+    self._use_depthwise = use_depthwise
+    self._use_residual = use_residual
+    self._activation = activation
+    self._se_inner_activation = se_inner_activation
+    self._se_gating_activation = se_gating_activation
+    self._depthwise_activation = depthwise_activation
+    self._expand_se_in_filters = expand_se_in_filters
+
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._bn_axis = -1
+    else:
+      self._bn_axis = 1
+
+    if depthwise_activation is None:
+      self._depthwise_activation = activation
+
+    if regularize_depthwise:
+      depthwise_kernel_weight_decay = conv_kernel_weight_decay
+    else:
+      depthwise_kernel_weight_decay = 0.0
+
+    if self._expand_ratio <= 1 and not self._use_depthwise:
+      raise ValueError(
+          'Undefined behavior if expand_ratio <= 1 and not use_depthwise')
+
+    expand_filters = self._in_filters
+    if self._expand_ratio > 1:
+      # First 1x1 conv for channel expansion.
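+      # (For example, in_filters=4 with expand_ratio=2 expands to 8 channels,
+      # rounded so that the result is divisible by divisible_by.)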
+ expand_filters = utils.make_divisible( + self._in_filters * self._expand_ratio, self._divisible_by) + + expand_kernel = 1 if self._use_depthwise else self._kernel_size + expand_stride = 1 if self._use_depthwise else self._strides + + self._conv1_bn_act = convolutions.Conv2DSame( + output_channels=expand_filters, + kernel_size=expand_kernel, + strides=expand_stride, + atrous_rate=1, + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation=self._activation, + conv_kernel_weight_decay=conv_kernel_weight_decay, + name='expand_conv') + + if self._use_depthwise: + # Depthwise conv. + self._conv2_bn_act = convolutions.DepthwiseConv2DSame( + kernel_size=self._kernel_size, + strides=self._strides, + atrous_rate=self._atrous_rate, + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation=self._depthwise_activation, + name='depthwise_conv') + + # Squeeze and excitation. + if self._se_ratio is not None and self._se_ratio > 0: + if self._expand_se_in_filters: + in_filters = expand_filters + else: + in_filters = self._in_filters + self._squeeze_excitation = squeeze_and_excite.SqueezeAndExcite( + in_filters=in_filters, + out_filters=expand_filters, + se_ratio=self._se_ratio, + divisible_by=self._divisible_by, + kernel_initializer='he_normal', + kernel_regularizer=tf.keras.regularizers.l2(conv_kernel_weight_decay), + activation=self._se_inner_activation, + gating_activation=self._se_gating_activation, + name=name + '_se') + else: + logging.info( + 'Squeeze and Excitation is skipped due to undefined se_ratio') + self._squeeze_excitation = None + + # Last 1x1 conv. + self._conv3_bn = convolutions.Conv2DSame( + output_channels=self._out_filters, + kernel_size=1, + strides=1, + atrous_rate=1, + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation=None, + conv_kernel_weight_decay=conv_kernel_weight_decay, + name='project_conv') + + def call(self, inputs, training=None): + shortcut = inputs + if self._expand_ratio > 1: + x = self._conv1_bn_act(inputs, training=training) + else: + x = inputs + + if self._use_depthwise: + x = self._conv2_bn_act(x, training=training) + + if self._squeeze_excitation is not None: + x = self._squeeze_excitation(x) + + x = self._conv3_bn(x, training=training) + + if (self._use_residual and + self._in_filters == self._out_filters): + x = tf.add(x, shortcut) + + return x diff --git a/model/layers/blocks_test.py b/model/layers/blocks_test.py new file mode 100644 index 0000000000000000000000000000000000000000..0be9e6365d2c0b80cfeb3e78453d64b5f7eaac64 --- /dev/null +++ b/model/layers/blocks_test.py @@ -0,0 +1,72 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for blocks.py.""" +import tensorflow as tf + +from deeplab2.model.layers import blocks + + +class BlocksTest(tf.test.TestCase): + + def test_inverted_bottleneck_block_output_shape(self): + batch, height, width, input_channels = 2, 17, 17, 4 + output_channels = 6 + input_tensor = tf.random.uniform( + shape=(batch, height, width, input_channels)) + ivb_block = blocks.InvertedBottleneckBlock( + in_filters=input_channels, + out_filters=output_channels, + expand_ratio=2, + strides=1, + name='inverted_bottleneck', + ) + output_tensor = ivb_block(input_tensor) + self.assertListEqual(output_tensor.get_shape().as_list(), + [batch, height, width, output_channels]) + + def test_inverted_bottleneck_block_feature_map_alignment(self): + batch, height, width, input_channels = 2, 17, 17, 128 + output_channels = 256 + input_tensor = tf.random.uniform( + shape=(batch, height, width, input_channels)) + ivb_block1 = blocks.InvertedBottleneckBlock( + in_filters=input_channels, + out_filters=output_channels, + expand_ratio=2, + strides=2, + name='inverted_bottleneck1', + ) + ivb_block1(input_tensor, False) + weights = ivb_block1.get_weights() + output_tensor = ivb_block1(input_tensor, False) + + ivb_block2 = blocks.InvertedBottleneckBlock( + in_filters=input_channels, + out_filters=output_channels, + expand_ratio=2, + strides=1, + name='inverted_bottleneck2', + ) + ivb_block2(input_tensor, False) + ivb_block2.set_weights(weights) + expected = ivb_block2(input_tensor, False)[:, ::2, ::2, :] + + self.assertAllClose(ivb_block1.get_weights(), ivb_block2.get_weights(), + atol=1e-4, rtol=1e-4) + self.assertAllClose(output_tensor, expected, atol=1e-4, rtol=1e-4) + +if __name__ == '__main__': + tf.test.main() diff --git a/model/layers/convolutions.py b/model/layers/convolutions.py new file mode 100644 index 0000000000000000000000000000000000000000..b24ab892c82e249d27f0ab870939756c6c78af68 --- /dev/null +++ b/model/layers/convolutions.py @@ -0,0 +1,666 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This file contains wrapper classes for convolution layers of tf.keras and Switchable Atrous Convolution. + +Switchable Atrous Convolution (SAC) is convolution with a switchable atrous +rate. It also has optional pre- and post-global context layers. +[1] Siyuan Qiao, Liang-Chieh Chen, Alan Yuille. DetectoRS: Detecting Objects + with Recursive Feature Pyramid and Switchable Atrous Convolution. + arXiv:2006.02334 +""" +import functools +from typing import Optional + +from absl import logging +import tensorflow as tf + +from deeplab2.model import utils +from deeplab2.model.layers import activations + + +def _compute_padding_size(kernel_size, atrous_rate): + kernel_size_effective = kernel_size + (kernel_size - 1) * (atrous_rate - 1) + pad_total = kernel_size_effective - 1 + pad_begin = pad_total // 2 + pad_end = pad_total - pad_begin + if pad_begin != pad_end: + logging.warn('Convolution requires one more padding to the ' + 'bottom-right pixel. 
This may cause misalignment.') + return (pad_begin, pad_end) + + +class GlobalContext(tf.keras.layers.Layer): + """Class for the global context modules in Switchable Atrous Convolution.""" + + def build(self, input_shape): + super().build(input_shape) + input_shape = tf.TensorShape(input_shape) + input_channel = self._get_input_channel(input_shape) + self.global_average_pooling = tf.keras.layers.GlobalAveragePooling2D() + self.convolution = tf.keras.layers.Conv2D( + input_channel, 1, strides=1, padding='same', name=self.name + '_conv', + kernel_initializer='zeros', bias_initializer='zeros') + + def call(self, inputs, *args, **kwargs): + outputs = self.global_average_pooling(inputs) + outputs = tf.expand_dims(outputs, axis=1) + outputs = tf.expand_dims(outputs, axis=1) + outputs = self.convolution(outputs) + return inputs + outputs + + def _get_input_channel(self, input_shape): + # Reference: tf.keras.layers.convolutional.Conv. + if input_shape.dims[-1].value is None: + raise ValueError('The channel dimension of the inputs ' + 'should be defined. Found `None`.') + return int(input_shape[-1]) + + +class SwitchableAtrousConvolution(tf.keras.layers.Conv2D): + """Class for the Switchable Atrous Convolution.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._average_pool = tf.keras.layers.AveragePooling2D( + pool_size=(5, 5), strides=1, padding='same') + self._switch = tf.keras.layers.Conv2D( + 1, + kernel_size=1, + strides=self.strides, + padding='same', + dilation_rate=1, + name='switch', + kernel_initializer='zeros', + bias_initializer='zeros') + + def build(self, input_shape): + super().build(input_shape) + if self.padding == 'causal': + tf_padding = 'VALID' + elif isinstance(self.padding, str): + tf_padding = self.padding.upper() + else: + tf_padding = self.padding + large_dilation_rate = list(self.dilation_rate) + large_dilation_rate = [r * 3 for r in large_dilation_rate] + self._large_convolution_op = functools.partial( + tf.nn.convolution, + strides=list(self.strides), + padding=tf_padding, + dilations=large_dilation_rate, + data_format=self._tf_data_format, + name=self.__class__.__name__ + '_large') + + def call(self, inputs): + # Reference: tf.keras.layers.convolutional.Conv. + input_shape = inputs.shape + switches = self._switch(self._average_pool(inputs)) + + if self._is_causal: # Apply causal padding to inputs for Conv1D. + inputs = tf.compat.v1.pad(inputs, self._compute_causal_padding(inputs)) + + outputs = self._convolution_op(inputs, self.kernel) + outputs_large = self._large_convolution_op(inputs, self.kernel) + + outputs = switches * outputs_large + (1 - switches) * outputs + + if self.use_bias: + outputs = tf.nn.bias_add( + outputs, self.bias, data_format=self._tf_data_format) + + if not tf.executing_eagerly(): + # Infer the static output shape: + out_shape = self.compute_output_shape(input_shape) + outputs.set_shape(out_shape) + + if self.activation is not None: + return self.activation(outputs) + return outputs + + def squeeze_batch_dims(self, inp, op, inner_rank): + # Reference: tf.keras.utils.conv_utils.squeeze_batch_dims. 
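+    # (This folds all leading batch dimensions into a single one, applies
+    # `op`, and then restores the original batch shape on the result.)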
+    with tf.name_scope('squeeze_batch_dims'):
+      shape = inp.shape
+
+      inner_shape = shape[-inner_rank:]
+      if not inner_shape.is_fully_defined():
+        inner_shape = tf.compat.v1.shape(inp)[-inner_rank:]
+
+      batch_shape = shape[:-inner_rank]
+      if not batch_shape.is_fully_defined():
+        batch_shape = tf.compat.v1.shape(inp)[:-inner_rank]
+
+      if isinstance(inner_shape, tf.TensorShape):
+        inp_reshaped = tf.reshape(inp, [-1] + inner_shape.as_list())
+      else:
+        inp_reshaped = tf.reshape(
+            inp, tf.concat(([-1], inner_shape), axis=-1))
+
+      out_reshaped = op(inp_reshaped)
+
+      out_inner_shape = out_reshaped.shape[-inner_rank:]
+      if not out_inner_shape.is_fully_defined():
+        out_inner_shape = tf.compat.v1.shape(out_reshaped)[-inner_rank:]
+
+      out = tf.reshape(
+          out_reshaped, tf.concat((batch_shape, out_inner_shape), axis=-1))
+
+      out.set_shape(inp.shape[:-inner_rank] + out.shape[-inner_rank:])
+      return out
+
+
+class Conv2DSame(tf.keras.layers.Layer):
+  """A wrapper class for a 2D convolution with 'same' padding.
+
+  In contrast to tf.keras.layers.Conv2D, this layer aligns the kernel with the
+  top-left corner rather than the bottom-right corner. Optionally, a batch
+  normalization and an activation can be added on top.
+  """
+
+  def __init__(
+      self,
+      output_channels: int,
+      kernel_size: int,
+      name: str,
+      strides: int = 1,
+      atrous_rate: int = 1,
+      use_bias: bool = True,
+      use_bn: bool = False,
+      bn_layer: tf.keras.layers.Layer = tf.keras.layers.BatchNormalization,
+      bn_gamma_initializer: str = 'ones',
+      activation: Optional[str] = None,
+      use_switchable_atrous_conv: bool = False,
+      use_global_context_in_sac: bool = False,
+      conv_kernel_weight_decay: float = 0.0):
+    """Initializes convolution with zero padding aligned to the top-left corner.
+
+    DeepLab aligns zero padding differently from tf.keras 'same' padding.
+    Considering a convolution with a 7x7 kernel, a stride of 2, and an even
+    input size, tf.keras 'same' padding adds 2 pixels of zero padding to the
+    top-left and 3 pixels to the bottom-right. However, for consistent feature
+    alignment, DeepLab requires an equal padding of 3 in all directions. This
+    behavior is consistent with e.g. the ResNet 'stem' block.
+
+    Args:
+      output_channels: An integer specifying the number of filters of the
+        convolution.
+      kernel_size: An integer specifying the size of the convolution kernel.
+      name: A string specifying the name of this layer.
+      strides: An optional integer or tuple of integers specifying the size of
+        the strides (default: 1).
+      atrous_rate: An optional integer or tuple of integers specifying the
+        atrous rate of the convolution (default: 1).
+      use_bias: An optional flag specifying whether bias should be added for the
+        convolution.
+      use_bn: An optional flag specifying whether batch normalization should be
+        added after the convolution (default: False).
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+      bn_gamma_initializer: An initializer for the batch norm gamma weight.
+      activation: An optional flag specifying an activation function to be added
+        after the convolution.
+      use_switchable_atrous_conv: Boolean, whether the layer uses switchable
+        atrous convolution.
+      use_global_context_in_sac: Boolean, whether the switchable atrous
+        convolution (SAC) uses pre- and post-global context.
+      conv_kernel_weight_decay: A float, the weight decay for convolution
+        kernels.
+
+    Raises:
+      ValueError: If both use_bias and use_bn are used in the convolution.
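+
+    As an illustration of the padding behavior (a sketch of what this layer
+    does internally for strided convolutions, not additional API),
+    _compute_padding_size(kernel_size=7, atrous_rate=1) returns (3, 3), so the
+    layer explicitly zero-pads 3 pixels on every side and then applies a
+    'valid' convolution:
+
+      pad = _compute_padding_size(7, 1)  # (3, 3)
+      x = tf.keras.layers.ZeroPadding2D(padding=(pad, pad))(x)
+      x = tf.keras.layers.Conv2D(64, 7, strides=2, padding='valid')(x)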
+ """ + super(Conv2DSame, self).__init__(name=name) + + if use_bn and use_bias: + raise ValueError('Conv2DSame is using convolution bias with batch_norm.') + + if use_global_context_in_sac: + self._pre_global_context = GlobalContext(name='pre_global_context') + + convolution_op = tf.keras.layers.Conv2D + convolution_padding = 'same' + if strides == 1 or strides == (1, 1): + if use_switchable_atrous_conv: + convolution_op = SwitchableAtrousConvolution + else: + padding = _compute_padding_size(kernel_size, atrous_rate) + self._zeropad = tf.keras.layers.ZeroPadding2D( + padding=(padding, padding), name='zeropad') + convolution_padding = 'valid' + self._conv = convolution_op( + output_channels, + kernel_size, + strides=strides, + padding=convolution_padding, + use_bias=use_bias, + dilation_rate=atrous_rate, + name='conv', + kernel_initializer='he_normal', + kernel_regularizer=tf.keras.regularizers.l2( + conv_kernel_weight_decay)) + + if use_global_context_in_sac: + self._post_global_context = GlobalContext(name='post_global_context') + + if use_bn: + self._batch_norm = bn_layer(axis=3, name='batch_norm', + gamma_initializer=bn_gamma_initializer) + + self._activation_fn = None + if activation is not None: + self._activation_fn = activations.get_activation(activation) + + self._use_global_context_in_sac = use_global_context_in_sac + self._strides = strides + self._use_bn = use_bn + + def call(self, input_tensor, training=False): + """Performs a forward pass. + + Args: + input_tensor: An input tensor of type tf.Tensor with shape [batch, height, + width, channels]. + training: A boolean flag indicating whether training behavior should be + used (default: False). + + Returns: + The output tensor. + """ + x = input_tensor + if self._use_global_context_in_sac: + x = self._pre_global_context(x) + + if not (self._strides == 1 or self._strides == (1, 1)): + x = self._zeropad(x) + x = self._conv(x) + + if self._use_global_context_in_sac: + x = self._post_global_context(x) + + if self._use_bn: + x = self._batch_norm(x, training=training) + + if self._activation_fn is not None: + x = self._activation_fn(x) + return x + + +class DepthwiseConv2DSame(tf.keras.layers.Layer): + """A wrapper class for a 2D depthwise convolution. + + In contrast to convolutions in tf.keras.layers.DepthwiseConv2D, this layers + aligns the kernel with the top-left corner rather than the bottom-right + corner. Optionally, a batch normalization and an activation can be added. + """ + + def __init__(self, + kernel_size: int, + name: str, + strides: int = 1, + atrous_rate: int = 1, + use_bias: bool = True, + use_bn: bool = False, + bn_layer=tf.keras.layers.BatchNormalization, + activation: Optional[str] = None): + """Initializes a 2D depthwise convolution. + + Args: + kernel_size: An integer specifying the size of the convolution kernel. + name: A string specifying the name of this layer. + strides: An optional integer or tuple of integers specifying the size of + the strides (default: 1). + atrous_rate: An optional integer or tuple of integers specifying the + atrous rate of the convolution (default: 1). + use_bias: An optional flag specifying whether bias should be added for the + convolution. + use_bn: An optional flag specifying whether batch normalization should be + added after the convolution (default: False). + bn_layer: An optional tf.keras.layers.Layer that computes the + normalization (default: tf.keras.layers.BatchNormalization). 
+      activation: An optional flag specifying an activation function to be added
+        after the convolution.
+
+    Raises:
+      ValueError: If both use_bias and use_bn are used in the convolution.
+    """
+    super(DepthwiseConv2DSame, self).__init__(name=name)
+
+    if use_bn and use_bias:
+      raise ValueError(
+          'DepthwiseConv2DSame is using convolution bias with batch_norm.')
+
+    if strides == 1 or strides == (1, 1):
+      convolution_padding = 'same'
+    else:
+      padding = _compute_padding_size(kernel_size, atrous_rate)
+      self._zeropad = tf.keras.layers.ZeroPadding2D(
+          padding=(padding, padding), name='zeropad')
+      convolution_padding = 'valid'
+    self._depthwise_conv = tf.keras.layers.DepthwiseConv2D(
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=convolution_padding,
+        use_bias=use_bias,
+        dilation_rate=atrous_rate,
+        name='depthwise_conv')
+    if use_bn:
+      self._batch_norm = bn_layer(axis=3, name='batch_norm')
+
+    self._activation_fn = None
+    if activation is not None:
+      self._activation_fn = activations.get_activation(activation)
+
+    self._strides = strides
+    self._use_bn = use_bn
+
+  def call(self, input_tensor, training=False):
+    """Performs a forward pass.
+
+    Args:
+      input_tensor: An input tensor of type tf.Tensor with shape [batch, height,
+        width, channels].
+      training: A boolean flag indicating whether training behavior should be
+        used (default: False).
+
+    Returns:
+      The output tensor.
+    """
+    x = input_tensor
+    if not (self._strides == 1 or self._strides == (1, 1)):
+      x = self._zeropad(x)
+    x = self._depthwise_conv(x)
+    if self._use_bn:
+      x = self._batch_norm(x, training=training)
+    if self._activation_fn is not None:
+      x = self._activation_fn(x)
+    return x
+
+
+class SeparableConv2DSame(tf.keras.layers.Layer):
+  """A wrapper class for a 2D separable convolution.
+
+  In contrast to convolutions in tf.keras.layers.SeparableConv2D, this layer
+  aligns the kernel with the top-left corner rather than the bottom-right
+  corner. Optionally, a batch normalization and an activation can be added.
+  """
+
+  def __init__(
+      self,
+      output_channels: int,
+      kernel_size: int,
+      name: str,
+      strides: int = 1,
+      atrous_rate: int = 1,
+      use_bias: bool = True,
+      use_bn: bool = False,
+      bn_layer: tf.keras.layers.Layer = tf.keras.layers.BatchNormalization,
+      activation: Optional[str] = None):
+    """Initializes a 2D separable convolution.
+
+    Args:
+      output_channels: An integer specifying the number of filters of the
+        convolution output.
+      kernel_size: An integer specifying the size of the convolution kernel.
+      name: A string specifying the name of this layer.
+      strides: An optional integer or tuple of integers specifying the size of
+        the strides (default: 1).
+      atrous_rate: An optional integer or tuple of integers specifying the
+        atrous rate of the convolution (default: 1).
+      use_bias: An optional flag specifying whether bias should be added for the
+        convolution.
+      use_bn: An optional flag specifying whether batch normalization should be
+        added after the convolution (default: False).
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+      activation: An optional flag specifying an activation function to be added
+        after the convolution.
+
+    Raises:
+      ValueError: If both use_bias and use_bn are used in the convolution.
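+
+    (As a design note: the layer below simply composes a DepthwiseConv2DSame
+    with a 1x1 pointwise Conv2DSame, so the optional batch norm and activation
+    are applied after both the depthwise and the pointwise convolutions.)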
+ """ + super(SeparableConv2DSame, self).__init__(name=name) + if use_bn and use_bias: + raise ValueError( + 'SeparableConv2DSame is using convolution bias with batch_norm.') + + self._depthwise = DepthwiseConv2DSame( + kernel_size=kernel_size, + name='depthwise', + strides=strides, + atrous_rate=atrous_rate, + use_bias=use_bias, + use_bn=use_bn, + bn_layer=bn_layer, + activation=activation) + self._pointwise = Conv2DSame( + output_channels=output_channels, + kernel_size=1, + name='pointwise', + strides=1, + atrous_rate=1, + use_bias=use_bias, + use_bn=use_bn, + bn_layer=bn_layer, + activation=activation) + + def call(self, input_tensor, training=False): + """Performs a forward pass. + + Args: + input_tensor: An input tensor of type tf.Tensor with shape [batch, height, + width, channels]. + training: A boolean flag indicating whether training behavior should be + used (default: False). + + Returns: + The output tensor. + """ + x = self._depthwise(input_tensor, training=training) + return self._pointwise(x, training=training) + + +class StackedConv2DSame(tf.keras.layers.Layer): + """Stacked Conv2DSame or SeparableConv2DSame. + + This class sequentially stacks a given number of Conv2DSame layers or + SeparableConv2DSame layers. + """ + + def __init__( + self, + num_layers: int, + conv_type: str, + output_channels: int, + kernel_size: int, + name: str, + strides: int = 1, + atrous_rate: int = 1, + use_bias: bool = True, + use_bn: bool = False, + bn_layer: tf.keras.layers.Layer = tf.keras.layers.BatchNormalization, + activation: Optional[str] = None): + """Initializes a stack of convolutions. + + Args: + num_layers: The number of convolutions to create. + conv_type: A string specifying the convolution type used in each block. + Must be one of 'standard_conv' or 'depthwise_separable_conv'. + output_channels: An integer specifying the number of filters of the + convolution output. + kernel_size: An integer specifying the size of the convolution kernel. + name: A string specifying the name of this layer. + strides: An optional integer or tuple of integers specifying the size of + the strides (default: 1). + atrous_rate: An optional integer or tuple of integers specifying the + atrous rate of the convolution (default: 1). + use_bias: An optional flag specifying whether bias should be added for the + convolution. + use_bn: An optional flag specifying whether batch normalization should be + added after the convolution (default: False). + bn_layer: An optional tf.keras.layers.Layer that computes the + normalization (default: tf.keras.layers.BatchNormalization). + activation: An optional flag specifying an activation function to be added + after the convolution. + + Raises: + ValueError: An error occurs when conv_type is neither 'standard_conv' + nor 'depthwise_separable_conv'. + """ + super(StackedConv2DSame, self).__init__(name=name) + if conv_type == 'standard_conv': + convolution_op = Conv2DSame + elif conv_type == 'depthwise_separable_conv': + convolution_op = SeparableConv2DSame + else: + raise ValueError('Convolution %s not supported.' 
% conv_type)
+
+    for index in range(num_layers):
+      current_name = utils.get_conv_bn_act_current_name(index, use_bn,
+                                                        activation)
+      utils.safe_setattr(self, current_name, convolution_op(
+          output_channels=output_channels,
+          kernel_size=kernel_size,
+          name=utils.get_layer_name(current_name),
+          strides=strides,
+          atrous_rate=atrous_rate,
+          use_bias=use_bias,
+          use_bn=use_bn,
+          bn_layer=bn_layer,
+          activation=activation))
+    self._num_layers = num_layers
+    self._use_bn = use_bn
+    self._activation = activation
+
+  def call(self, input_tensor, training=False):
+    """Performs a forward pass.
+
+    Args:
+      input_tensor: An input tensor of type tf.Tensor with shape [batch, height,
+        width, channels].
+      training: A boolean flag indicating whether training behavior should be
+        used (default: False).
+
+    Returns:
+      The output tensor.
+    """
+    x = input_tensor
+    for index in range(self._num_layers):
+      current_name = utils.get_conv_bn_act_current_name(index, self._use_bn,
+                                                        self._activation)
+      x = getattr(self, current_name)(x, training=training)
+    return x
+
+
+class Conv1D(tf.keras.layers.Layer):
+  """A wrapper class for a 1D convolution with batch norm and activation.
+
+  Conv1D creates a convolution kernel that is convolved with the layer input
+  over a single spatial (or temporal) dimension to produce a tensor of outputs.
+  The input should always be 3D with shape [batch, length, channel], so
+  accordingly, the optional batch norm is done on axis=2.
+
+  In DeepLab, we use Conv1D only with kernel_size = 1 for dual path transformer
+  layers in MaX-DeepLab [1] architectures.
+
+  Reference:
+    [1] MaX-DeepLab: End-to-End Panoptic Segmentation with Mask Transformers,
+        CVPR 2021.
+          Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen.
+  """
+
+  def __init__(
+      self,
+      output_channels: int,
+      name: str,
+      use_bias: bool = True,
+      use_bn: bool = False,
+      bn_layer: tf.keras.layers.Layer = tf.keras.layers.BatchNormalization,
+      bn_gamma_initializer: str = 'ones',
+      activation: Optional[str] = None,
+      conv_kernel_weight_decay: float = 0.0,
+      kernel_initializer='he_normal',
+      kernel_size: int = 1,
+      padding: str = 'valid'):
+    """Initializes a Conv1D.
+
+    Args:
+      output_channels: An integer specifying the number of filters of the
+        convolution.
+      name: A string specifying the name of this layer.
+      use_bias: An optional flag specifying whether bias should be added for the
+        convolution.
+      use_bn: An optional flag specifying whether batch normalization should be
+        added after the convolution (default: False).
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+      bn_gamma_initializer: An initializer for the batch norm gamma weight.
+      activation: An optional flag specifying an activation function to be added
+        after the convolution.
+      conv_kernel_weight_decay: A float, the weight decay for convolution
+        kernels.
+      kernel_initializer: An initializer for the convolution kernel.
+      kernel_size: An integer specifying the size of the convolution kernel.
+      padding: An optional string specifying the padding to use. Must be either
+        'same' or 'valid' (default: 'valid').
+
+    Raises:
+      ValueError: If both use_bias and use_bn are used in the convolution.
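+
+    A minimal usage sketch (mirroring test_conv1d_shape in
+    convolutions_test.py): with the default kernel_size=1, Conv1D acts as a
+    per-position fully connected layer over a [batch, length, channels] input:
+
+      conv = Conv1D(output_channels=64, name='conv', use_bias=False,
+                    use_bn=True)
+      output = conv(tf.random.uniform(shape=(2, 180, 3)))  # [2, 180, 64]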
+ """ + super(Conv1D, self).__init__(name=name) + + if use_bn and use_bias: + raise ValueError('Conv1D is using convlution bias with batch_norm.') + + self._conv = tf.keras.layers.Conv1D( + output_channels, + kernel_size=kernel_size, + strides=1, + padding=padding, + use_bias=use_bias, + name='conv', + kernel_initializer=kernel_initializer, + kernel_regularizer=tf.keras.regularizers.l2( + conv_kernel_weight_decay)) + + self._batch_norm = None + if use_bn: + # Batch norm uses axis=2 because the input is 3D with channel being the + # last dimension. + self._batch_norm = bn_layer(axis=2, name='batch_norm', + gamma_initializer=bn_gamma_initializer) + + self._activation_fn = None + if activation is not None: + self._activation_fn = activations.get_activation(activation) + + def call(self, input_tensor, training=False): + """Performs a forward pass. + + Args: + input_tensor: An input tensor of type tf.Tensor with shape [batch, length, + channels]. + training: A boolean flag indicating whether training behavior should be + used (default: False). + + Returns: + The output tensor. + """ + x = self._conv(input_tensor) + if self._batch_norm is not None: + x = self._batch_norm(x, training=training) + if self._activation_fn is not None: + x = self._activation_fn(x) + return x diff --git a/model/layers/convolutions_test.py b/model/layers/convolutions_test.py new file mode 100644 index 0000000000000000000000000000000000000000..676135cba31b82a582ae8f04c424e55b839dbcff --- /dev/null +++ b/model/layers/convolutions_test.py @@ -0,0 +1,290 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for convolutions.""" + +import numpy as np +import tensorflow as tf + +from deeplab2.model.layers import convolutions +from deeplab2.utils import test_utils + + +class ConvolutionsTest(tf.test.TestCase): + + def test_conv2dsame_logging(self): + with self.assertLogs(level='WARN'): + _ = convolutions.Conv2DSame( + output_channels=1, + kernel_size=8, + strides=2, + name='conv', + use_bn=False, + activation=None) + + def test_conv2dsame_conv(self): + conv = convolutions.Conv2DSame( + output_channels=1, + kernel_size=1, + name='conv', + use_bn=False, + activation=None) + input_tensor = tf.random.uniform(shape=(2, 180, 180, 5)) + + predicted_tensor = conv(input_tensor) + expected_tensor = np.dot(input_tensor.numpy(), + conv._conv.get_weights()[0])[..., 0, 0] + + # Compare only up to 5 decimal digits to account for numerical accuracy. 
+ np.testing.assert_almost_equal( + predicted_tensor.numpy(), expected_tensor, decimal=5) + + def test_conv2dsame_relu(self): + conv = convolutions.Conv2DSame( + output_channels=1, + kernel_size=1, + name='conv', + activation='relu', + use_bn=False) + input_tensor = tf.random.uniform(shape=(2, 180, 180, 5)) + + predicted_tensor = conv(input_tensor) + expected_tensor = np.dot(input_tensor.numpy(), + conv._conv.get_weights()[0])[..., 0, 0] + expected_tensor[expected_tensor < 0.0] = 0.0 + + # Compare only up to 5 decimal digits to account for numerical accuracy. + np.testing.assert_almost_equal( + predicted_tensor.numpy(), expected_tensor, decimal=5) + + def test_conv2dsame_relu6(self): + conv = convolutions.Conv2DSame( + output_channels=1, + kernel_size=1, + name='conv', + activation='relu6', + use_bn=False) + input_tensor = tf.random.uniform(shape=(2, 180, 180, 5)) * 10. + + predicted_tensor = conv(input_tensor) + expected_tensor = np.dot(input_tensor.numpy(), + conv._conv.get_weights()[0])[..., 0, 0] + expected_tensor[expected_tensor < 0.0] = 0.0 + expected_tensor[expected_tensor > 6.0] = 6.0 + + # Compare only up to 5 decimal digits to account for numerical accuracy. + np.testing.assert_almost_equal( + predicted_tensor.numpy(), expected_tensor, decimal=5) + + def test_conv2dsame_shape(self): + conv = convolutions.Conv2DSame( + output_channels=64, + kernel_size=7, + strides=2, + name='conv', + use_bias=False, + use_bn=True) + input_tensor = tf.random.uniform(shape=(2, 180, 180, 3)) + + predicted_tensor = conv(input_tensor) + expected_shape = [2, 90, 90, 64] + + self.assertListEqual(predicted_tensor.shape.as_list(), expected_shape) + + @test_utils.test_all_strategies + def test_conv2d_sync_bn(self, strategy): + input_tensor = tf.random.uniform(shape=(2, 180, 180, 3)) + + for bn_layer in test_utils.NORMALIZATION_LAYERS: + with strategy.scope(): + conv = convolutions.Conv2DSame( + output_channels=64, + kernel_size=7, + strides=2, + name='conv', + use_bias=False, + use_bn=True, + bn_layer=bn_layer) + conv(input_tensor) + + def test_depthwise_conv(self): + conv = convolutions.DepthwiseConv2DSame( + kernel_size=1, use_bn=False, use_bias=True, activation=None, + name='conv') + input_tensor = tf.random.uniform(shape=(2, 180, 180, 5)) + + predicted_tensor = conv(input_tensor) + expected_tensor = ( + input_tensor.numpy() * conv._depthwise_conv.get_weights()[0][..., 0]) + + np.testing.assert_equal(predicted_tensor.numpy(), expected_tensor) + + def test_depthwise_relu(self): + conv = convolutions.DepthwiseConv2DSame( + kernel_size=1, use_bn=False, activation='relu', name='conv') + input_tensor = tf.random.uniform(shape=(2, 180, 180, 5)) + + predicted_tensor = conv(input_tensor) + expected_tensor = ( + input_tensor.numpy() * conv._depthwise_conv.get_weights()[0][..., 0]) + expected_tensor[expected_tensor < 0.0] = 0.0 + + np.testing.assert_equal(predicted_tensor.numpy(), expected_tensor) + + def test_depthwise_shape(self): + conv = convolutions.DepthwiseConv2DSame( + kernel_size=3, use_bn=True, use_bias=False, activation='relu', + name='conv') + input_shape = [2, 180, 180, 5] + input_tensor = tf.random.uniform(shape=input_shape) + + predicted_tensor = conv(input_tensor) + expected_shape = input_shape + + self.assertListEqual(predicted_tensor.shape.as_list(), expected_shape) + + def test_depthwise_shape_with_stride2(self): + conv = convolutions.DepthwiseConv2DSame( + kernel_size=3, use_bn=True, use_bias=False, activation='relu', + strides=2, name='conv') + input_shape = [2, 181, 181, 5] + input_tensor = 
tf.random.uniform(shape=input_shape) + + predicted_tensor = conv(input_tensor) + expected_shape = [2, 91, 91, 5] + + self.assertListEqual(predicted_tensor.shape.as_list(), expected_shape) + + @test_utils.test_all_strategies + def test_depthwise_sync_bn(self, strategy): + input_tensor = tf.random.uniform(shape=(2, 180, 180, 3)) + + for bn_layer in test_utils.NORMALIZATION_LAYERS: + with strategy.scope(): + conv = convolutions.DepthwiseConv2DSame( + kernel_size=7, + name='conv', + use_bn=True, + use_bias=False, + bn_layer=bn_layer, + activation='relu') + _ = conv(input_tensor) + + def test_global_context(self): + input_tensor = tf.random.uniform(shape=(2, 180, 180, 3)) + global_context = convolutions.GlobalContext(name='global_context') + output_tensor = global_context(input_tensor) + # global_context is supposed to not change any values before training. + np.testing.assert_array_almost_equal(input_tensor.numpy(), + output_tensor.numpy()) + + def test_switchable_atrous_conv_class(self): + # Tests Switchable Atrous Convolution by equations. + input_tensor = tf.random.uniform(shape=(3, 180, 180, 32)) + sac_layer = convolutions.SwitchableAtrousConvolution( + 64, + kernel_size=3, + padding='same', + name='sac_conv') + switch_conv = sac_layer._switch + _ = switch_conv(input_tensor) + switch_conv.kernel = tf.random.uniform( + switch_conv.kernel.shape, + minval=-1, + maxval=1, + dtype=switch_conv.kernel.dtype) + switch_conv.bias = tf.random.uniform( + switch_conv.bias.shape, + minval=-1, + maxval=1, + dtype=switch_conv.bias.dtype) + small_conv = tf.keras.layers.Conv2D( + 64, + kernel_size=3, + padding='same', + dilation_rate=1, + name='small_conv') + large_conv = tf.keras.layers.Conv2D( + 64, + kernel_size=3, + padding='same', + dilation_rate=3, + name='large_conv') + _ = small_conv(input_tensor) + _ = large_conv(input_tensor) + outputs = sac_layer(input_tensor) + small_conv.kernel = sac_layer.kernel + large_conv.kernel = sac_layer.kernel + # Compute the expected outputs. + switch_outputs = sac_layer._switch(sac_layer._average_pool(input_tensor)) + large_outputs = large_conv(input_tensor) + small_outputs = small_conv(input_tensor) + expected_outputs = (switch_outputs * large_outputs + + (1 - switch_outputs) * small_outputs) + np.testing.assert_array_almost_equal(expected_outputs.numpy(), + outputs.numpy()) + + def test_switchable_atrous_conv_in_conv2dsame(self): + # Tests Switchable Atrous Convolution in Conv2DSame. 
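+    # Note: Conv2DSame only routes to SwitchableAtrousConvolution when
+    # strides == 1 (see its __init__); strided convolutions fall back to the
+    # explicitly zero-padded standard convolution.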
+ input_tensor = tf.random.uniform(shape=(3, 180, 180, 32)) + layer = convolutions.Conv2DSame( + output_channels=64, + kernel_size=7, + strides=1, + name='conv', + use_bias=False, + use_bn=True, + use_switchable_atrous_conv=True, + use_global_context_in_sac=True) + output_tensor = layer(input_tensor) + np.testing.assert_array_almost_equal(output_tensor.shape.as_list(), + [3, 180, 180, 64]) + + def test_conv1d_shape(self): + conv = convolutions.Conv1D( + output_channels=64, + name='conv', + use_bias=False, + use_bn=True) + input_tensor = tf.random.uniform(shape=(2, 180, 3)) + predicted_tensor = conv(input_tensor) + expected_shape = [2, 180, 64] + self.assertListEqual(predicted_tensor.shape.as_list(), expected_shape) + + def test_separable_conv2d_same_output_shape(self): + conv = convolutions.SeparableConv2DSame( + output_channels=64, + kernel_size=3, + name='conv') + input_tensor = tf.random.uniform(shape=(2, 5, 5, 3)) + predicted_tensor = conv(input_tensor) + expected_shape = [2, 5, 5, 64] + self.assertListEqual(predicted_tensor.shape.as_list(), expected_shape) + + def test_stacked_conv2d_same_output_shape(self): + conv = convolutions.StackedConv2DSame( + num_layers=2, + conv_type='depthwise_separable_conv', + output_channels=64, + kernel_size=3, + name='conv') + input_tensor = tf.random.uniform(shape=(2, 5, 5, 3)) + predicted_tensor = conv(input_tensor) + expected_shape = [2, 5, 5, 64] + self.assertListEqual(predicted_tensor.shape.as_list(), expected_shape) + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/layers/drop_path.py b/model/layers/drop_path.py new file mode 100644 index 0000000000000000000000000000000000000000..b07f820b52451245725a664b52ea294fa77ebe69 --- /dev/null +++ b/model/layers/drop_path.py @@ -0,0 +1,133 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Drop path operation. + +This scripts implements the drop path operation, proposed in +Gao Huang, Yu Sun, Zhuang Liu, Daniel Sedra, Kilian Weinberger, +Deep Networks with Stochastic Depth. In ECCV, 2016. +""" +import tensorflow as tf + + +def get_drop_path_keep_prob(keep_prob_for_last_stage, schedule, + current_stage, num_stages): + """Gets drop path keep probability for current stage. + + Args: + keep_prob_for_last_stage: A float, the drop path keep probability for + last stage. This flag is used in conjunction with the flag `schedule`, as + they together determine drop path keep probability for the other stages. + schedule: A string, the drop path schedule. Currently, we support + 'constant': use the same drop path keep probability for all stages, and + 'linear': linearly decrease the drop path keep probability from 1.0 at + 0-th stage (or STEM) to `keep_prob_for_last_stage` at last stage. + current_stage: An integer, current stage number. + num_stages: An integer, the number of stages. + + Returns: + The drop path keep probability for the current stage. + + Raises: + ValueError: If schedule is not supported. 
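+
+  For example (matching test_linear_drop_path_schedule in drop_path_test.py),
+  with keep_prob_for_last_stage=0.8, schedule='linear', current_stage=1, and
+  num_stages=4, the returned keep probability is
+  1.0 - (1.0 - 0.8) * 1 / 4 = 0.95.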
+ """ + if schedule == 'constant': + return keep_prob_for_last_stage + elif schedule == 'linear': + return 1.0 - (1.0 - keep_prob_for_last_stage) * current_stage / num_stages + else: + raise ValueError('Unexpected schedule %s.' % schedule) + + +def generate_drop_path_random_mask(input_tensor, drop_path_keep_prob): + """Generates a random mask for drop path. + + This function generates a random mask for training models with drop path. Each + scalar in the output indicates whether the block or path will be kept. The + scalars are scaled with (1.0 / drop_path_keep_prob) so that the output will + have the same expectation no mather what the drop_path_keep_prob is. + + Reference: + "Deep Networks with Stochastic Depth" https://arxiv.org/pdf/1603.09382.pdf + + Args: + input_tensor: An input [batch_size, n_1, n_2, ..., n_k] tensor. + drop_path_keep_prob: A float, the keep probability for dropping path. + + Returns: + binary_tensor: A [batch_size, 1, 1, ..., 1] tensor with the same dtype as + the input_tensor. + """ + binary_tensor = None + if drop_path_keep_prob < 1.0: + input_shape = input_tensor.get_shape().as_list() + random_tensor_shape = [input_shape[0]] + [1] * (len(input_shape) - 1) + random_tensor = drop_path_keep_prob + random_tensor += tf.random.uniform( + random_tensor_shape, dtype=input_tensor.dtype) + binary_tensor = tf.math.divide(tf.floor(random_tensor), drop_path_keep_prob) + return binary_tensor + + +class DropPath(tf.keras.layers.Layer): + """Drop path layer. + + For details, please see the original paper listed below. + Gao Huang, Yu Sun, Zhuang Liu, Daniel Sedra, Kilian Weinberger, + Deep Networks with Stochastic Depth. In ECCV, 2016. + """ + + def __init__(self, drop_path_keep_prob=1.0, name=None): + """Initializes a drop path layer. + + Args: + drop_path_keep_prob: A float, the keep probability for dropping path. + name: An optional string specifying the operation name. + + Rasies: + ValueError: If drop_path_keep_prob is <= 0 or > 1. + """ + super(DropPath, self).__init__(name=name) + self._drop_path_keep_prob = drop_path_keep_prob + if self._drop_path_keep_prob <= 0 or self._drop_path_keep_prob > 1.0: + raise ValueError('drop_path_keep_prob not valid. Got %f.' % + self._drop_path_keep_prob) + + def call(self, input_tensor, training=False): + """Performs a forward pass. + + Args: + input_tensor: An input tensor of type tf.Tensor with shape [batch, height, + width, channels]. + training: A boolean flag indicating whether training behavior should be + used (default: False). + + Returns: + The output tensor. + """ + if self._drop_path_keep_prob == 1.0 or not training: + return input_tensor + drop_path_random_mask = generate_drop_path_random_mask( + input_tensor, self._drop_path_keep_prob) + if drop_path_random_mask is not None: + input_tensor = input_tensor * drop_path_random_mask + return input_tensor + + def get_config(self): + config = { + 'drop_path_keep_prob': self._drop_path_keep_prob, + } + base_config = super(DropPath, self).get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/model/layers/drop_path_test.py b/model/layers/drop_path_test.py new file mode 100644 index 0000000000000000000000000000000000000000..7d02f5fa9d2de935cdeb043bfbad81441e0b1b6f --- /dev/null +++ b/model/layers/drop_path_test.py @@ -0,0 +1,76 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Test for drop_path.py.""" +import numpy as np +import tensorflow as tf + +from deeplab2.model.layers import drop_path + +# Set a fixed random seed. +tf.random.set_seed(1) + + +class DropPathTest(tf.test.TestCase): + + def test_drop_path_keep_prob_one(self): + # Test drop_path_keep_prob = 1, where output should be equal to input. + drop_path_keep_prob = 1.0 + input_tensor = tf.random.uniform(shape=(3, 65, 65, 32)) + layer_op = drop_path.DropPath(drop_path_keep_prob) + output_tensor = layer_op(input_tensor, training=True) + np.testing.assert_equal(input_tensor.numpy(), output_tensor.numpy()) + + def test_not_training_mode(self): + # Test not training mode, where output should be equal to input. + drop_path_keep_prob = 0.8 + input_tensor = tf.random.uniform(shape=(3, 65, 65, 32)) + layer_op = drop_path.DropPath(drop_path_keep_prob) + output_tensor = layer_op(input_tensor, training=False) + np.testing.assert_equal(input_tensor.numpy(), output_tensor.numpy()) + + def test_drop_path(self): + drop_path_keep_prob = 0.8 + input_tensor = tf.random.uniform(shape=(3, 65, 65, 32)) + layer_op = drop_path.DropPath(drop_path_keep_prob) + output_tensor = layer_op(input_tensor, training=True) + self.assertFalse(np.array_equal(input_tensor.numpy(), + output_tensor.numpy())) + + def test_constant_drop_path_schedule(self): + keep_prob_for_last_stage = 0.8 + current_stage_keep_prob = drop_path.get_drop_path_keep_prob( + keep_prob_for_last_stage, + schedule='constant', + current_stage=2, + num_stages=5) + self.assertEqual(current_stage_keep_prob, keep_prob_for_last_stage) + + def test_linear_drop_path_schedule(self): + keep_prob_for_last_stage = 0.8 + current_stage_keep_prob = drop_path.get_drop_path_keep_prob( + keep_prob_for_last_stage, + schedule='linear', + current_stage=1, + num_stages=4) + self.assertEqual(current_stage_keep_prob, 0.95) + + def test_unknown_drop_path_schedule(self): + with self.assertRaises(ValueError): + _ = drop_path.get_drop_path_keep_prob(0.8, 'unknown', 1, 4) + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/layers/dual_path_transformer.py b/model/layers/dual_path_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..806db522ac2ece7304d7d4fb481d85274614e580 --- /dev/null +++ b/model/layers/dual_path_transformer.py @@ -0,0 +1,488 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Implements dual path transformer layers proposed in MaX-DeepLab [1]. 
+ +Dual-path transformer introduces a global memory path in addition to a CNN path, +allowing bi-directional communication with any CNN layers. + +[1] MaX-DeepLab: End-to-End Panoptic Segmentation with Mask Transformers, + CVPR 2021. + Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen. +""" + +import tensorflow as tf + +from deeplab2.model import utils +from deeplab2.model.layers import activations +from deeplab2.model.layers import convolutions + + +class AttentionOperation(tf.keras.layers.Layer): + """Computes standard 1D multi-head attention with query, key, and value.""" + + def __init__(self, + name, + activation, + transformer_activation, + bn_layer=tf.keras.layers.BatchNormalization): + """Initializes an AttentionOperation layer. + + Args: + name: A string, the name of this layer. + activation: A string, type of activation function to apply. + transformer_activation: A string, type of activation function for + self-attention. Support 'sigmoid' and 'softmax'. + bn_layer: An optional tf.keras.layers.Layer that computes the + normalization (default: tf.keras.layers.BatchNormalization). + """ + super(AttentionOperation, self).__init__(name=name) + # batch_norm_similarity has shape [batch, num_heads, num_query, num_key], + # where num_query and num_key usually equals to height or width or length, + # i.e., spatial dimensions, so batch norm is applied to axis=1 only. + self._batch_norm_similarity = bn_layer(axis=1, name='batch_norm_similarity') + # batch_norm_retrieved_value is done on shape [batch, num_heads, length, + # value_channels], which will be reshaped to the output shape [batch, + # length, value_channels * num_heads], so we apply batch norm on the + # effective channel dimension -- value_channels * num_heads. + self._batch_norm_retrieved_value = bn_layer( + axis=[1, 3], name='batch_norm_retrieved_value') + self._activation_fn = activations.get_activation(activation) + self._transformer_activation_fn = activations.get_activation( + transformer_activation) + + def call(self, inputs, training=False): + """Performs an AttentionOperation. + + Args: + inputs: A tuple of (query, key, value), where query is [batch, num_head, + query_length, channels] tensor, key is a [batch, num_head, key_length, + channels] tensor, and value is a [batch, key_length, num_head, + value_channels] tensor. + training: A boolean, whether the model is in training mode. + + Returns: + output: A [batch, query_length, num_head * value_channels] tensor, the + retrieved value. + """ + # Decode query, key, and value from inputs. + query, key, value = inputs + # Compute attention similarity. + similarity_logits = tf.einsum('bhld,bhmd->bhlm', query, key) + similarity_logits = self._batch_norm_similarity( + similarity_logits, training=training) + # Apply a transformer attention activation function, e.g. softmax. + attention_weights = self._transformer_activation_fn(similarity_logits) + # Retrieve the value content. + retrieved_value = tf.einsum( + 'bhlm,bmhd->bhld', attention_weights, value) + retrieved_value = self._batch_norm_retrieved_value( + retrieved_value, training=training) + retrieved_value = self._activation_fn(retrieved_value) + # Reshape the output. + return utils.transpose_and_reshape_for_attention_operation( + retrieved_value) + + +class DualPathTransformerLayer(tf.keras.layers.Layer): + """Applies a dual path transformer layer, as proposed in MaX-DeepLab [1]. 
+ + Dual-path transformer layer takes a pixel space input and a memory space + input, and performs memory2pixel attention, pixel2memory attention, and + memory2memory self-attention. Note that the pixel2pixel self-attention or + convolution in the pixel space is implemented in axial_layers.py and + axial_blocks.py. Thus, the pixel2pixel operation is not included in this + DualPathTransformerLayer implementation. Please use this class together with + a residual block with axial-attention, global-attention, or convolution in + order to construct the full dual path transformer in the paper. + + [1] MaX-DeepLab: End-to-End Panoptic Segmentation with Mask Transformers, + CVPR 2021. + Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen. + """ + + def __init__(self, + name='dual_path_transformer_layer', + activation='relu', + filters=128, + num_heads=8, + bottleneck_expansion=2, + key_expansion=1, + value_expansion=2, + feed_forward_network_channels=2048, + use_memory_self_attention=True, + use_pixel2memory_feedback_attention=True, + transformer_activation='softmax', + bn_layer=tf.keras.layers.BatchNormalization, + conv_kernel_weight_decay=0.0): + """Initializes a DualPathTransformerLayer. + + This function implements a dual path transformer layer between a pixel space + and a memory space, as described in the MaX-DeepLab paper. In this dual path + transformer, the memory2pixel cross attention and the memory self-attention + share a single activation, e.g. softmax. + + Reference: + MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers", + CVPR 2021. https://arxiv.org/abs/2012.00759 + Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen. + + Args: + name: A string, the name of this dual path transformer layer. + activation: A string, type of activation function to apply. + filters: An integer, the base number of channels for the layer. + num_heads: An integer, the number of heads in multi-head attention. + bottleneck_expansion: A float, the channel expansion ratio for the + bottleneck. + key_expansion: A float, the channel expansion ratio for keys. + value_expansion: A float, the channel expansion ratio for values. + feed_forward_network_channels: An integer, the number of channels for the + feed_forward_network. Zero means no feed_forward_network will be + applied. + use_memory_self_attention: A boolean, whether to apply the memory space + self-attention. + use_pixel2memory_feedback_attention: A boolean, whether to apply the + pixel2memory feedback attention. + transformer_activation: A string, type of activation function for + self-attention. Support 'sigmoid' and 'softmax'. + bn_layer: A tf.keras.layers.Layer that computes the normalization + (default: tf.keras.layers.BatchNormalization). + conv_kernel_weight_decay: A float, the weight decay for convolution + kernels. + + Raises: + ValueError: If filters * key_expansion is not divisible by num_heads. + ValueError: If filters * value_expansion is not divisible by num_heads. 
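+
+    A minimal usage sketch (mirroring dual_path_transformer_test.py): with the
+    default arguments, the layer consumes a pixel-space tensor, a memory-space
+    tensor, and a float training flag, and returns three tensors:
+
+      layer = DualPathTransformerLayer()
+      pixel, activated_pixel, memory = layer(
+          (tf.zeros([2, 4225, 126]), tf.zeros([2, 127, 128]),
+           tf.constant(0.0, dtype=tf.float32)))
+      # pixel and activated_pixel: [2, 4225, 126]; memory: [2, 127, 128].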
+ """ + super(DualPathTransformerLayer, self).__init__(name=name) + + bottleneck_channels = int(round(filters * bottleneck_expansion)) + total_key_depth = int(round(filters * key_expansion)) + total_value_depth = int(round(filters * value_expansion)) + + if total_key_depth % num_heads: + raise ValueError('Total_key_depth should be divisible by num_heads.') + + if total_value_depth % num_heads: + raise ValueError('Total_value_depth should be divisible by num_heads.') + + # Compute query key value with one convolution and a batch norm layer. The + # initialization std is standard transformer initialization (without batch + # norm), as used in SASA and ViT. In our case, we use batch norm by default, + # so it does not require careful tuning. If one wants to remove all batch + # norms in axial attention, this standard initialization should still be + # good, but a more careful initialization is encouraged. + initialization_std = bottleneck_channels ** -0.5 + + self._memory_conv1_bn_act = convolutions.Conv1D( + bottleneck_channels, 'memory_conv1_bn_act', + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation=activation, + conv_kernel_weight_decay=conv_kernel_weight_decay) + + self._pixel_conv1_bn_act = convolutions.Conv1D( + bottleneck_channels, 'pixel_conv1_bn_act', + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation=activation, + conv_kernel_weight_decay=conv_kernel_weight_decay) + + # We always compute the query for memory space, since it gathers information + # from the pixel space and thus cannot be removed. We compute the key and + # value for memory space only when they are necessary (i.e. either + # use_memory_self_attention or use_pixel2memory_feedback_attention). + if use_memory_self_attention or use_pixel2memory_feedback_attention: + self._memory_qkv_conv_bn = convolutions.Conv1D( + total_key_depth * 2 + total_value_depth, 'memory_qkv_conv_bn', + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation='none', + conv_kernel_weight_decay=conv_kernel_weight_decay, + kernel_initializer=tf.keras.initializers.TruncatedNormal( + stddev=initialization_std)) + else: + # Compute memory query only if memory key and value are not used. + self._memory_query_conv_bn = convolutions.Conv1D( + total_key_depth, 'memory_query_conv_bn', + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation='none', + conv_kernel_weight_decay=conv_kernel_weight_decay, + kernel_initializer=tf.keras.initializers.TruncatedNormal( + stddev=initialization_std)) + + # For the pixel space, we always compute the key and value, since they + # provide information for the memory space and thus cannot be removed. We + # compute the query for pixel space only when it is necessary (i.e. + # use_pixel2memory_feedback_attention is True). 
+ if use_pixel2memory_feedback_attention: + self._pixel_qkv_conv_bn = convolutions.Conv1D( + total_key_depth * 2 + total_value_depth, 'pixel_qkv_conv_bn', + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation='none', + conv_kernel_weight_decay=conv_kernel_weight_decay, + kernel_initializer=tf.keras.initializers.TruncatedNormal( + stddev=initialization_std)) + else: + self._pixel_kv_conv_bn = convolutions.Conv1D( + total_key_depth + total_value_depth, 'pixel_kv_conv_bn', + use_bias=False, + use_bn=True, + bn_layer=bn_layer, + activation='none', + conv_kernel_weight_decay=conv_kernel_weight_decay, + kernel_initializer=tf.keras.initializers.TruncatedNormal( + stddev=initialization_std)) + self._memory_attention = AttentionOperation( + 'memory_attention', activation, transformer_activation, + bn_layer=bn_layer) + if use_pixel2memory_feedback_attention: + self._pixel_attention = AttentionOperation( + 'pixel_attention', activation, transformer_activation, + bn_layer=bn_layer) + + self._use_memory_self_attention = use_memory_self_attention + self._use_pixel2memory_feedback_attention = ( + use_pixel2memory_feedback_attention) + self._total_key_depth = total_key_depth + self._total_value_depth = total_value_depth + self._num_heads = num_heads + self._bn_layer = bn_layer + self._conv_kernel_weight_decay = conv_kernel_weight_decay + self._activation = activation + self._activation_fn = activations.get_activation(activation) + self._feed_forward_network_channels = feed_forward_network_channels + + def build(self, input_shape_list): + pixel_shape, memory_shape = input_shape_list[:2] + # Here we follow ResNet bottleneck blocks: we apply a batch norm with gamma + # initialized at zero, followed by drop path and an activation function. + # Initializing this gamma at zero ensures that at random initialization of + # the model, the skip connections dominate all residual blocks. In this way, + # all the skip connections construct an identity mapping that passes the + # gradients (without any distortion from the randomly initialized blocks) to + # all residual blocks. This helps training at early epochs. + # Reference: "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour". + # https://arxiv.org/abs/1706.02677 + self._memory_conv3_bn = convolutions.Conv1D( + memory_shape[-1], 'memory_conv3_bn', + use_bias=False, + use_bn=True, + bn_layer=self._bn_layer, + bn_gamma_initializer='zeros', + activation='none', + conv_kernel_weight_decay=self._conv_kernel_weight_decay) + + if self._feed_forward_network_channels > 0: + self._memory_ffn_conv1_bn_act = convolutions.Conv1D( + self._feed_forward_network_channels, 'memory_ffn_conv1_bn_act', + use_bias=False, + use_bn=True, + bn_layer=self._bn_layer, + activation=self._activation, + conv_kernel_weight_decay=self._conv_kernel_weight_decay) + # Again, we follow ResNet bottleneck blocks: we apply a batch norm with + # gamma initialized at zero, followed by drop path and an activation + # function. 
+ self._memory_ffn_conv2_bn = convolutions.Conv1D( + memory_shape[-1], 'memory_ffn_conv2_bn', + use_bias=False, + use_bn=True, + bn_layer=self._bn_layer, + bn_gamma_initializer='zeros', + activation='none', + conv_kernel_weight_decay=self._conv_kernel_weight_decay) + if self._use_pixel2memory_feedback_attention: + self._pixel_conv3_bn = convolutions.Conv1D( + pixel_shape[-1], 'pixel_conv3_bn', + use_bias=False, + use_bn=True, + bn_layer=self._bn_layer, + bn_gamma_initializer='zeros', + activation='none', + conv_kernel_weight_decay=self._conv_kernel_weight_decay) + + def call(self, inputs): + """Performs a forward pass. + + We have to define drop_path_masks outside the layer call and pass it into + the layer call, because recompute_grad (gradient checkpointing) does not + allow any randomness within the function call. In addition, recompute_grad + only supports float tensors as inputs. For this reason, the training flag + should be also passed as a float tensor. For the same reason, we cannot + support passing drop_path_random_mask as None. Instead, we ask the users to + pass only the first two tensors when drop path is not used. + + Args: + inputs: A tuple of 3 or 6 tensors, containing + pixel_space_input should be a [batch, num_pixel, pixel_space_channels] + tensor. + memory_space_input should be a [batch, num_memory, + memory_space_channels] tensor. + float_tensor_training should be a float tensor of 0.0 or 1.0, whether + the model is in training mode. + (optional) pixel_space_drop_path_mask is a drop path mask tensor of + shape [batch, 1, 1] for the pixel space. + (optional) memory_space_attention_drop_path_mask is a drop path mask + tensor of shape [batch, 1, 1] for the memory space. + (optional) memory_space_feed_forward_network_drop_path_mask is a drop + path mask tensor of shape [batch, 1, 1] for the memory space feed + forward network. + + Returns: + pixel_space_output: A [batch, num_pixel, pixel_space_channels] tensor. + activated_pixel_space_output: A [batch, num_pixel, pixel_space_channels] + tensor, activated pixel_space_output. + memory_space_output: A [batch, num_memory, memory_space_channels] + tensor. + + Raises: + ValueError: If the length of inputs is not 3 or 6. + """ + if len(inputs) not in (3, 6): + raise ValueError('The length of inputs should be either 3 or 6.') + + # Unpack the inputs. + (pixel_space_input, memory_space_input, float_tensor_training, + pixel_space_drop_path_mask, memory_space_attention_drop_path_mask, + memory_space_feed_forward_network_drop_path_mask) = ( + utils.pad_sequence_with_none(inputs, target_length=6)) + + # Recompute_grad takes only float tensors as inputs. It does not allow + # bools or boolean tensors. For this reason, we cast training to a float + # tensor outside this call, and now we cast it back to a boolean tensor. + training = tf.cast(float_tensor_training, tf.bool) + + # Decode the inputs shapes. + pixel_shape = pixel_space_input.get_shape().as_list() + memory_shape = memory_space_input.get_shape().as_list() + + # Similar to the ResNet bottleneck design, we do an input down projection + # in both the pixel space and the memory space. + memory_space = self._memory_conv1_bn_act(memory_space_input, + training=training) + + # Pixel space input is not activated. 
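+    # (The residual branch carries non-activated features, so the activation
+    # is applied here, right before the pixel-space input down projection.)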
+ pixel_space = self._pixel_conv1_bn_act( + self._activation_fn(pixel_space_input), training=training) + + if (self._use_memory_self_attention or + self._use_pixel2memory_feedback_attention): + memory_space_qkv = self._memory_qkv_conv_bn(memory_space, + training=training) + # Split, reshape, and transpose the query, key, and value. + memory_query, memory_key, memory_value = ( + tf.split(memory_space_qkv, [ + self._total_key_depth, self._total_key_depth, + self._total_value_depth], axis=-1)) + memory_key = utils.reshape_and_transpose_for_attention_operation( + memory_key, self._num_heads) + memory_value = tf.reshape(memory_value, [ + -1, memory_shape[1], self._num_heads, + self._total_value_depth // self._num_heads]) + else: + # Compute memory query only if memory key and value are not used. + memory_query = self._memory_query_conv_bn(memory_space, + training=training) + # Reshape and transpose the query. + memory_query = utils.reshape_and_transpose_for_attention_operation( + memory_query, self._num_heads) + + if self._use_pixel2memory_feedback_attention: + pixel_space_qkv = self._pixel_qkv_conv_bn(pixel_space, + training=training) + # Split the query, key, and value. + pixel_query, pixel_key, pixel_value = tf.split( + pixel_space_qkv, [ + self._total_key_depth, self._total_key_depth, + self._total_value_depth], axis=-1) + pixel_query = utils.reshape_and_transpose_for_attention_operation( + pixel_query, self._num_heads) + else: + pixel_space_kv = self._pixel_kv_conv_bn(pixel_space, training=training) + # Split the key and the value. + pixel_key, pixel_value = tf.split(pixel_space_kv, [ + self._total_key_depth, self._total_value_depth], axis=-1) + # Reshape and transpose the key and the value. + pixel_key = utils.reshape_and_transpose_for_attention_operation( + pixel_key, self._num_heads) + pixel_value = tf.reshape(pixel_value, [ + -1, pixel_shape[1], self._num_heads, + self._total_value_depth // self._num_heads]) + + # Compute memory space attention. + if not self._use_memory_self_attention: + # If memory self attention is not used, then only memory2pixel cross + # attention is used for the memory space. In this case, the key and the + # value are simply pixel_key and pixel_value. + memory_attention_key = pixel_key + memory_attention_value = pixel_value + else: + # If we also use memory self attention, the key and the value are the + # concatenation of keys and values in both the pixel space and the + # memory space. + memory_attention_key = tf.concat([pixel_key, memory_key], axis=2) + memory_attention_value = tf.concat([pixel_value, memory_value], axis=1) + + memory_space = self._memory_attention( + (memory_query, memory_attention_key, memory_attention_value), + training=training) + memory_space = self._memory_conv3_bn(memory_space, training=training) + + if memory_space_attention_drop_path_mask is not None: + memory_space = memory_space * memory_space_attention_drop_path_mask + memory_space_output = self._activation_fn( + memory_space_input + memory_space) + + # Apply an optional feed-forward network to the memory space. 
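+    # The feed-forward network below is conv1d -> activation -> conv1d with a
+    # zero-initialized batch norm gamma on the second conv1d, so at
+    # initialization the residual connection dominates this block.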
+    if self._feed_forward_network_channels > 0:
+      memory_space = self._memory_ffn_conv1_bn_act(memory_space_output,
+                                                   training=training)
+      memory_space = self._memory_ffn_conv2_bn(memory_space,
+                                               training=training)
+      if memory_space_feed_forward_network_drop_path_mask is not None:
+        memory_space = (memory_space *
+                        memory_space_feed_forward_network_drop_path_mask)
+      memory_space_output = self._activation_fn(
+          memory_space_output + memory_space)
+
+    # Compute pixel space attention and the output projection only when
+    # pixel2memory_feedback_attention is used.
+    if self._use_pixel2memory_feedback_attention:
+      pixel_space = self._pixel_attention(
+          (pixel_query, memory_key, memory_value), training=training)
+      pixel_space = self._pixel_conv3_bn(pixel_space, training=training)
+      if pixel_space_drop_path_mask is not None:
+        pixel_space = pixel_space * pixel_space_drop_path_mask
+      pixel_space_output = pixel_space_input + pixel_space
+    else:
+      # If pixel2memory_feedback_attention is not used, the pixel_space_input
+      # is not changed.
+      pixel_space_output = pixel_space_input
+    activated_pixel_space_output = self._activation_fn(pixel_space_output)
+
+    # Return the pixel space output and memory space output. Note that we
+    # return the pixel space output with and without the activation function,
+    # because our decoder might use non-activated features.
+    return (pixel_space_output,
+            activated_pixel_space_output,
+            memory_space_output)
diff --git a/model/layers/dual_path_transformer_test.py b/model/layers/dual_path_transformer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b2fc42c992188af73bd2974f8198b86ecc6da93
--- /dev/null
+++ b/model/layers/dual_path_transformer_test.py
@@ -0,0 +1,67 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for dual_path_transformer."""
+
+import tensorflow as tf
+
+from deeplab2.model.layers import dual_path_transformer
+
+
+class TransformerLayersTest(tf.test.TestCase):
+
+  def test_default_attention_operation_output_shape(self):
+    layer = dual_path_transformer.AttentionOperation(
+        'attention', 'relu', 'softmax')
+    output = layer((tf.zeros([2, 8, 4225, 127]),
+                    tf.zeros([2, 8, 422, 127]),
+                    tf.zeros([2, 422, 8, 128])))
+    self.assertListEqual(output.get_shape().as_list(), [2, 4225, 1024])
+
+  def test_default_transformer_layer_output_shape(self):
+    layer = dual_path_transformer.DualPathTransformerLayer()
+    float_training_tensor = tf.constant(0.0, dtype=tf.float32)
+    output = layer((tf.zeros([2, 4225, 126]),
+                    tf.zeros([2, 127, 128]),
+                    float_training_tensor))
+    self.assertListEqual(output[0].get_shape().as_list(), [2, 4225, 126])
+    self.assertListEqual(output[1].get_shape().as_list(), [2, 4225, 126])
+    self.assertListEqual(output[2].get_shape().as_list(), [2, 127, 128])
+
+  def test_zero_feed_forward_network_output_shape(self):
+    layer = dual_path_transformer.DualPathTransformerLayer(
+        feed_forward_network_channels=0)
+    float_training_tensor = tf.constant(0.0, dtype=tf.float32)
+    output = layer((tf.zeros([2, 4225, 128]),
+                    tf.zeros([2, 128, 128]),
+                    float_training_tensor))
+    self.assertListEqual(output[0].get_shape().as_list(), [2, 4225, 128])
+    self.assertListEqual(output[1].get_shape().as_list(), [2, 4225, 128])
+    self.assertListEqual(output[2].get_shape().as_list(), [2, 128, 128])
+
+  def test_attention_types_output_shape(self):
+    layer = dual_path_transformer.DualPathTransformerLayer(
+        use_memory_self_attention=False,
+        use_pixel2memory_feedback_attention=False)
+    float_training_tensor = tf.constant(0.0, dtype=tf.float32)
+    output = layer((tf.zeros([2, 4225, 128]),
+                    tf.zeros([2, 128, 128]),
+                    float_training_tensor))
+    self.assertListEqual(output[0].get_shape().as_list(), [2, 4225, 128])
+    self.assertListEqual(output[1].get_shape().as_list(), [2, 4225, 128])
+    self.assertListEqual(output[2].get_shape().as_list(), [2, 128, 128])
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/model/layers/positional_encodings.py b/model/layers/positional_encodings.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1db2a784dfaa6c4b9b64a7dfde6c8273f927a31
--- /dev/null
+++ b/model/layers/positional_encodings.py
@@ -0,0 +1,243 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Implements relative [1, 2, 3] and global [3, 4, 5] positional encodings.
+
+Our Axial-Deeplab [1] proposes position-sensitive self-attention which uses
+relative positional encodings for query, key, and value.
+
+[1] Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation,
+    ECCV 2020 Spotlight.
+    Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille,
+    Liang-Chieh Chen.
+[2] Self-Attention with Relative Position Representations, NAACL 2018.
+    Peter Shaw, Jakob Uszkoreit, Ashish Vaswani.
+[3] Tensor2Tensor for Neural Machine Translation, arXiv 2018,
+    http://arxiv.org/abs/1803.07416.
+    Ashish Vaswani, Samy Bengio, Eugene Brevdo, Francois Chollet,
+    Aidan N. Gomez, Stephan Gouws, Llion Jones, Łukasz Kaiser,
+    Nal Kalchbrenner, Niki Parmar, Ryan Sepassi, Noam Shazeer,
+    Jakob Uszkoreit.
+[4] Attention Is All You Need, NeurIPS 2017.
+    Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
+    Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin.
+[5] An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale,
+    ICLR 2021.
+    Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn,
+    Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer,
+    Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
+"""
+
+from absl import logging
+import tensorflow as tf
+
+# MAX_SPAN defines the maximum shape of positional encoding. It is set as a
+# large constant so that we can easily load and use models with global or
+# different local spans, but it should not be too large so that it takes a
+# reasonable amount of memory. The value 255 is larger than almost all span
+# choices (e.g. 65 for local attention, 129, 193, etc.) so 255 is large enough.
+# 257 would be a good choice for GPU, but 255 is more efficient on TPU, which
+# pads tensors to multiples of 128.
+MAX_SPAN = 255
+
+
+def _compute_relative_distance_matrix(query_length, key_length):
+  """Computes a relative distance matrix between queries and keys.
+
+  We assume that the queries and the keys are centered, i.e.,
+  key_length = memory_flange + query_length + memory_flange.
+
+  The function is based on the _generate_relative_positions_matrix function in
+  common_attention.py of tensor2tensor codebase:
+  https://github.com/tensorflow/tensor2tensor/blob/5623deb79cfcd28f8f8c5463b58b5bd76a81fd0d/tensor2tensor/layers/common_attention.py#L1670
+
+  Args:
+    query_length: An integer, the length of queries.
+    key_length: An integer, the length of keys.
+
+  Returns:
+    distance_matrix: A [query_length, key_length] tensor.
+
+  Raises:
+    ValueError: If (key_length - query_length) is odd, i.e., the assumption
+      does not hold.
+  """
+  if (key_length - query_length) % 2:
+    raise ValueError('Key_length should be query_length + 2 * memory_flange.')
+  key_index = tf.range(key_length)
+  query_index = tf.range(query_length) + (key_length - query_length) // 2
+  distance_matrix = key_index[None, :] - query_index[:, None]
+  # Shift the distance_matrix so that it is >= 0. Each entry of the shifted
+  # distance_matrix will index a relative positional embedding.
+  distance_matrix = distance_matrix + MAX_SPAN - 1
+  if query_length + (key_length - query_length) // 2 > MAX_SPAN:
+    logging.warn('Axial attention span is larger than MAX_SPAN. In this '
+                 'case, we use a single shared embedding for all positions '
+                 'beyond this relative distance. Please make sure this '
+                 'behavior is intended.')
+    distance_matrix = tf.clip_by_value(distance_matrix, 0, MAX_SPAN * 2 - 2)
+  return distance_matrix
+
+
+class RelativePositionalEncoding(tf.keras.layers.Layer):
+  """Generates relative positional encoding.
+
+  The function is based on the _generate_relative_positions_embeddings function
+  in common_attention.py of tensor2tensor codebase:
+  https://github.com/tensorflow/tensor2tensor/blob/5623deb79cfcd28f8f8c5463b58b5bd76a81fd0d/tensor2tensor/layers/common_attention.py#L1691
+  """
+
+  def __init__(self, query_length, key_length, depth, num_heads, name,
+               initialization_std=1.0, conv_kernel_weight_decay=0.0):
+    """Initializes a relative position encoding layer.
+
+    Args:
+      query_length: An integer, the length of queries.
+      key_length: An integer, the length of keys.
+      depth: An integer, the number of embedding channels per head.
+      num_heads: An integer, the number of heads in multi-head attention.
+      name: A string, the name of the embedding.
+      initialization_std: A float, the initialization std for the embedding.
+      conv_kernel_weight_decay: A float, the weight decay for convolution
+        kernels.
+    """
+    super(RelativePositionalEncoding, self).__init__(name=name)
+    self._initializer = tf.keras.initializers.TruncatedNormal(
+        stddev=initialization_std)
+    self._regularizer = tf.keras.regularizers.l2(conv_kernel_weight_decay)
+
+    self._relative_distance_matrix = _compute_relative_distance_matrix(
+        query_length, key_length)
+    self._num_heads = num_heads
+    self._embedding_shape = (MAX_SPAN * 2 - 1, depth)
+
+  def build(self, input_shape):
+    """Builds the embedding weight."""
+    del input_shape
+    self._embeddings = self.add_weight(
+        shape=self._embedding_shape,
+        initializer=self._initializer, trainable=True,
+        name='embeddings',
+        regularizer=self._regularizer)
+
+  def call(self, inputs):
+    """A forward pass that gathers the relative positional encoding.
+
+    Args:
+      inputs: Unused, present to conform to the tf.keras.layers.Layer API.
+
+    Returns:
+      output: A [num_heads, query_length, key_length, depth] tensor, the
+        relative positional encodings for each head and each query-key pair.
+    """
+    del inputs
+    # Gather the embeddings according to the relative distances.
+    embeddings = tf.gather(self._embeddings, self._relative_distance_matrix)
+    return tf.tile(tf.expand_dims(embeddings, axis=0),
+                   [self._num_heads, 1, 1, 1])
+
+
+class AddAbsolutePositionalEncoding(tf.keras.layers.Layer):
+  """Adds a learnable absolute positional encoding to the input feature.
+
+  Supports both 1D and 2D versions of the positional encoding: (1) 1D
+  positional encoding represents each row index with an embedding, and
+  represents each column index with another embedding. This results in a total
+  of (height + width) learnable embedding vectors. (2) 2D positional encoding
+  adds independent embeddings to each input grid position. This choice uses a
+  total of (height * width) learnable embedding vectors.
+  """
+
+  def __init__(self, name, positional_encoding_type=None,
+               bn_layer=tf.keras.layers.BatchNormalization,
+               conv_kernel_weight_decay=0.0):
+    """Initializes an AddAbsolutePositionalEncoding layer.
+
+    Args:
+      name: A string specifying the name of the layer.
+      positional_encoding_type: A string, type of the positional encoding.
+        Supports '2D', '1D', 'none', and None. The feature is returned as is
+        if positional_encoding_type is 'none' or None.
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+      conv_kernel_weight_decay: A float, the weight decay for convolution
+        kernels.
+
+    Raises:
+      ValueError: If positional_encoding_type is not one of '1D', '2D', 'none',
+        and None.
+    """
+    super(AddAbsolutePositionalEncoding, self).__init__(name=name)
+    # Normalize None to 'none' so that the .lower() comparisons below and in
+    # build() do not raise an AttributeError when None is passed.
+    if positional_encoding_type is None:
+      positional_encoding_type = 'none'
+    if positional_encoding_type.lower() not in ('none', '2d', '1d'):
+      raise ValueError(positional_encoding_type + ' is not supported.')
+    self._positional_encoding_type = positional_encoding_type
+    # This initialization std is tuned for global attention, but it does not
+    # seem to be a sensitive hyper-parameter, since we use batch norm on the
+    # positional encodings.
+    self._initializer = tf.keras.initializers.TruncatedNormal(stddev=0.2)
+    self._kernel_regularizer = tf.keras.regularizers.l2(
+        conv_kernel_weight_decay)
+    self._bn_layer = bn_layer
+
+  def build(self, input_shape):
+    """Builds the layer weights whose shape depends on the 4D input shape."""
+    _, height, width, channel = input_shape
+    if self._positional_encoding_type.lower() == '2d':
+      self._embeddings = self.add_weight(
+          shape=(1, height, width, channel),
+          initializer=self._initializer, trainable=True,
+          name='embeddings',
+          regularizer=self._kernel_regularizer)
+      self._batch_norm = self._bn_layer(axis=-1, name='batch_norm')
+    elif self._positional_encoding_type.lower() == '1d':
+      # Generate separable positional encodings for the height axis and the
+      # width axis.
+      self._height_axis_embeddings = self.add_weight(
+          shape=(1, height, 1, channel),
+          initializer=self._initializer, trainable=True,
+          name='height_axis_embeddings',
+          regularizer=self._kernel_regularizer)
+      self._height_axis_batch_norm = self._bn_layer(
+          axis=-1, name='height_axis_batch_norm')
+      # The width axis embeddings span the width dimension, i.e., shape
+      # (1, 1, width, channel), rather than the height dimension.
+      self._width_axis_embeddings = self.add_weight(
+          shape=(1, 1, width, channel),
+          initializer=self._initializer, trainable=True,
+          name='width_axis_embeddings',
+          regularizer=self._kernel_regularizer)
+      self._width_axis_batch_norm = self._bn_layer(
+          axis=-1, name='width_axis_batch_norm')
+
+  def call(self, features, training=False):
+    """Performs a forward pass.
+
+    Args:
+      features: An input [batch, height, width, channels] tensor.
+      training: A boolean, whether the model is in training mode.
+
+    Returns:
+      output: The sum of the input feature and learnable positional encodings.
+    """
+    if (self._positional_encoding_type is None or
+        self._positional_encoding_type.lower() == 'none'):
+      return features
+    elif self._positional_encoding_type.lower() == '2d':
+      positional_encoding = self._batch_norm(self._embeddings,
+                                             training=training)
+    elif self._positional_encoding_type.lower() == '1d':
+      height_axis_positional_encoding = self._height_axis_batch_norm(
+          self._height_axis_embeddings, training=training)
+      width_axis_positional_encoding = self._width_axis_batch_norm(
+          self._width_axis_embeddings, training=training)
+      positional_encoding = (height_axis_positional_encoding +
+                             width_axis_positional_encoding)
+    return features + positional_encoding
diff --git a/model/layers/positional_encodings_test.py b/model/layers/positional_encodings_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..05d78b55e42a2acab5dccdd49f00664d9aecf4cb
--- /dev/null
+++ b/model/layers/positional_encodings_test.py
@@ -0,0 +1,60 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for positional_encodings.""" + +import tensorflow as tf + +from deeplab2.model.layers import positional_encodings + + +class PositionalEncodingsTest(tf.test.TestCase): + + def test_compute_relative_distance_matrix_output_shape(self): + output = positional_encodings._compute_relative_distance_matrix(33, 97) + self.assertListEqual(output.get_shape().as_list(), [33, 97]) + + def test_relative_positional_encoding_output_shape(self): + layer = positional_encodings.RelativePositionalEncoding( + 33, 97, 32, 8, 'rpe') + output = layer(None) + self.assertListEqual(output.get_shape().as_list(), [8, 33, 97, 32]) + + def test_add_absolute_positional_encoding_1d_output_shape(self): + layer = positional_encodings.AddAbsolutePositionalEncoding( + 'ape1d', positional_encoding_type='1d') + shape = [2, 5, 5, 3] + output = layer(tf.zeros(shape)) + self.assertEqual(len(layer.get_weights()), 10) + self.assertListEqual(output.get_shape().as_list(), shape) + + def test_add_absolute_positional_encoding_2d_output_shape(self): + layer = positional_encodings.AddAbsolutePositionalEncoding( + 'ape2d', positional_encoding_type='2d') + shape = [2, 5, 5, 3] + output = layer(tf.zeros(shape)) + self.assertEqual(len(layer.get_weights()), 5) + self.assertListEqual(output.get_shape().as_list(), shape) + + def test_add_absolute_positional_encoding_none_output_shape(self): + layer = positional_encodings.AddAbsolutePositionalEncoding( + 'none', positional_encoding_type='none') + shape = [2, 5, 5, 3] + output = layer(tf.zeros(shape)) + self.assertEqual(len(layer.get_weights()), 0) + self.assertListEqual(output.get_shape().as_list(), shape) + +if __name__ == '__main__': + tf.test.main() diff --git a/model/layers/recompute_grad.py b/model/layers/recompute_grad.py new file mode 100644 index 0000000000000000000000000000000000000000..8bf0e2ad66595e794b187cb7564669ce2ee6c19a --- /dev/null +++ b/model/layers/recompute_grad.py @@ -0,0 +1,289 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Library for rematerialization. + +Incubates a version of tf.recompute_grad that is XLA compatible. + +This file is based on the recompute_grad.py in the bigbird codebase [1]: +https://github.com/google-research/bigbird/blob/db06498ec8804c6438111938d8654b66ddaccd5d/bigbird/core/recompute_grad.py + +[1] Big Bird: Transformers for Longer Sequences, NeurIPS 2020. + Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris + Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li + Yang, Amr Ahmed. 
+""" +import collections +import os +import threading +from typing import Deque, List, NamedTuple, Optional, Sequence + +from absl import logging +import tensorflow.compat.v2 as tf + +# pylint: disable=g-direct-tensorflow-import +from tensorflow.python.framework import ops +from tensorflow.python.ops import custom_gradient + + +# Remove when https://github.com/tensorflow/tensorflow/pull/45298 +# gets merged +def get_variable_by_name(var_name): + """Retrieves tf.Variable from name in MirroredStrategy (multi-gpu).""" + + # Get all variables, but it will have copies from different replicas + all_global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + + def _replica_filter(var): + """Filter out variables from different context.""" + try: + return var_name == var.op.name + except AttributeError: + return False + candidate_vars = list(filter(_replica_filter, all_global_vars)) + + if len(candidate_vars) >= 1: + # Filter out non-trainable variables. + candidate_vars = [v for v in candidate_vars if v.trainable] + else: + raise ValueError('Unsuccessful at finding variable {}.'.format(var_name)) + + if len(candidate_vars) == 1: + return candidate_vars[0] + elif len(candidate_vars) > 1: + raise ValueError( + 'Unsuccessful at finding trainable variable {}. ' + 'Number of candidates: {}. ' + 'Candidates: {}'.format(var_name, len(candidate_vars), candidate_vars)) + else: + # The variable is not trainable. + return None +custom_gradient.get_variable_by_name = get_variable_by_name + + +class RecomputeContext( + NamedTuple('RecomputeContext', [ + ('is_recomputing', bool), + ('seed', tf.Tensor), + ('children', Deque['RecomputeContext']), + ])): + """Context for recomputation. + + Attributes: + is_recomputing: Whether we are in a recomputation phase. + seed: Scalar integer tensor that should be used with stateless random ops + for deterministic behavior and correct computation of the gradient. + children: Nested `RecomputeContext` instances. Used internally by + `recompute_grad` to track nested instances of `RecomputeContext`. + """ + + def __enter__(self): + return _context_stack.push(self) + + def __exit__(self, exc_type, exc_value, traceback): + _context_stack.pop(self) + + +# Simplified version of `_DefaultStack` in +# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/framework/ops.py. +class _ContextStack(threading.local): + """A thread-local stack for providing implicit recompute contexts.""" + + def __init__(self): + super(_ContextStack, self).__init__() + self._stack = [] + + def top(self) -> Optional[RecomputeContext]: + return self._stack[-1] if self._stack else None + + def push(self, context: RecomputeContext): + self._stack.append(context) + return context + + def pop(self, context: RecomputeContext): + if self._stack[-1] is not context: + raise AssertionError('Nesting violated for RecomputeContext.') + self._stack.pop() + + +_context_stack = _ContextStack() + + +def get_recompute_context() -> Optional[RecomputeContext]: + """Returns the current recomputing context if it exists.""" + return _context_stack.top() + + +# Adapted from +# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/control_flow_util.py. 
+def _get_containing_xla_context(graph: tf.Graph) -> Optional[object]:
+  """Returns the first ancestor `XLAControlFlowContext` in the `graph`."""
+  ctxt = graph._get_control_flow_context()  # pylint: disable=protected-access
+  while ctxt:
+    if ctxt.IsXLAContext():
+      return ctxt
+    ctxt = ctxt.outer_context
+  return None
+
+
+def _in_xla_context(graph: Optional[tf.Graph] = None) -> bool:
+  """Detects whether we are in an XLA context."""
+  if '--tf_xla_auto_jit=2' in os.environ.get('TF_XLA_FLAGS', ''):
+    return True
+  graph = tf.compat.v1.get_default_graph() if graph is None else graph
+  while True:
+    if _get_containing_xla_context(graph) is not None:
+      return True
+    try:
+      graph = graph.outer_graph
+    except AttributeError:
+      return False
+
+
+def _force_data_dependency(
+    first_compute: Sequence[tf.Tensor],
+    then_compute: Sequence[tf.Tensor]) -> List[tf.Tensor]:
+  """Forces all of `then_compute` to depend on all of `first_compute`.
+
+  Uses a dummy data dependency, which is useful when running on TPUs because
+  XLA ignores control dependencies. Only supports float arguments.
+
+  Args:
+    first_compute: Sequence of `Tensor`s to be executed before `then_compute`.
+    then_compute: Sequence of `Tensor`s to be executed after `first_compute`.
+
+  Returns:
+    Sequence of `Tensor`s with the same length as `then_compute`.
+
+  Raises:
+    ValueError: if ranks are unknown or types are not floating.
+  """
+
+  def _first_element(x):
+    if x.shape.ndims is None:
+      raise ValueError('Rank of Tensor %s must be known' % x)
+    ndims = x.shape.ndims
+    begin = tf.zeros(ndims, dtype=tf.int32)
+    size = tf.ones(ndims, dtype=tf.int32)
+    return tf.reshape(tf.slice(x, begin, size), [])
+
+  first_compute_sum = tf.add_n(
+      [_first_element(x) for x in first_compute if x is not None])
+  dtype = first_compute_sum.dtype
+  if not dtype.is_floating:
+    raise ValueError('_force_data_dependency only supports floating dtypes.')
+  zero = tf.cast(0.0, dtype) * first_compute_sum
+  then_compute_sequence = [
+      x + tf.cast(zero, x.dtype) if x is not None else None
+      for x in tf.nest.flatten(then_compute)
+  ]
+  return tf.nest.pack_sequence_as(then_compute, then_compute_sequence)
+
+
+def _make_seed_if_none(seed: Optional[tf.Tensor]) -> tf.Tensor:
+  """Uses the global generator to make a seed if necessary."""
+  if seed is not None:
+    return seed
+  generator = tf.random.experimental.get_global_generator()
+  # The two seeds for stateless random ops don't have individual semantics and
+  # are scrambled together, so providing one seed is fine. This makes it easier
+  # for users to provide a local seed without worrying about integer overflow.
+  # See `make_seeds` in
+  # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/stateful_random_ops.py.
+  try:
+    return generator.uniform_full_int([], tf.int32, name='recompute_grad_seed')
+  except (RuntimeError, TypeError, ValueError, tf.errors.NotFoundError) as e:
+    # For a number of reasons, the above operation can fail, e.g., when using
+    # multiple graphs or toggling between eager and graph modes. Reset the
+    # generator and retry.
+    logging.warn('Resetting the generator. %s: %s', type(e), e)
+    tf.random.experimental.set_global_generator(None)
+    generator = tf.random.experimental.get_global_generator()
+    return generator.uniform_full_int([], tf.int32, name='recompute_grad_seed')
+
+
+def recompute_grad(f, seed=None):
+  """An eager-compatible version of recompute_grad.
+
+  For f(*args, **kwargs), this supports gradients with respect to args, or to
+  any variables residing in the kwarg 'variables'. Note that for keras layer
+  and model objects, this is handled automatically.
+
+  Warning: If `f` was originally a tf.keras Model or Layer object, `g` will not
+  be able to access the member variables of that object, because `g` returns
+  through the wrapper function `inner`. When recomputing gradients through
+  objects that inherit from keras, we suggest keeping a reference to the
+  underlying object around for the purpose of accessing these variables.
+
+  Args:
+    f: function `f(*x)` that returns a `Tensor` or sequence of `Tensor`
+      outputs.
+    seed: Optional seed for random ops. `seed` should be an integer scalar
+      `Tensor`. When compiling to XLA, `seed` must have dtype `tf.int32`. If
+      `seed` is not provided, one will be generated.
+
+  Returns:
+    A function `g` that wraps `f`, but which recomputes `f` on the backwards
+    pass of a gradient call.
+  """
+
+  @tf.custom_gradient
+  def inner(*args, **kwargs):
+    """Inner function closure for calculating gradients."""
+    # Detect when we're nested and in the backwards pass, so we don't generate
+    # an additional seed.
+    parent_context = get_recompute_context()
+    if parent_context is not None and parent_context.is_recomputing:
+      # Use the cached context in the recomputation phase.
+      with parent_context.children.popleft()._replace(
+          is_recomputing=True) as context:
+        result = f(*args, **kwargs)
+    else:
+      with RecomputeContext(
+          is_recomputing=False,
+          seed=_make_seed_if_none(seed),
+          children=collections.deque()) as context:
+        result = f(*args, **kwargs)
+      # In the forward pass, build up a tree of recomputation contexts.
+      if parent_context is not None and not parent_context.is_recomputing:
+        parent_context.children.append(context)
+
+    def grad(*dresult, **grad_kwargs):
+      """Gradient function calculation for inner function."""
+      variables = grad_kwargs.pop('variables', None)
+      if grad_kwargs:
+        raise ValueError('Found unexpected kwargs for `grad`: ',
+                         list(grad_kwargs.keys()))
+      inputs, seed = list(args), context.seed
+      if _in_xla_context():
+        inputs = _force_data_dependency(
+            tf.nest.flatten(dresult), inputs + [seed])
+        seed = inputs.pop()
+      # Recompute the forward pass under the cached context, watching the
+      # inputs (and variables) so the local gradients can be computed.
+      with tf.GradientTape() as tape:
+        tape.watch(inputs)
+        if variables is not None:
+          tape.watch(variables)
+        with tf.control_dependencies(dresult):
+          with context._replace(is_recomputing=True, seed=seed):
+            result = f(*inputs, **kwargs)
+      kw_vars = []
+      if variables is not None:
+        kw_vars = list(variables)
+      grads = tape.gradient(
+          result, list(inputs) + kw_vars, output_gradients=dresult)
+      return grads[:len(inputs)], grads[len(inputs):]
+
+    return result, grad
+
+  return inner
diff --git a/model/layers/recompute_grad_test.py b/model/layers/recompute_grad_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d488b2e900ff29454b3aa342ea71bb6ccc8e3c84
--- /dev/null
+++ b/model/layers/recompute_grad_test.py
@@ -0,0 +1,254 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for recompute_grad. + +This file is based on the recompute_grad_test.py in the etcmodel codebase [1]: +https://github.com/google-research/google-research/blob/ae9d07f22d31b36069bb8321e9d015e46dd8e8bb/etcmodel/layers/recompute_grad_test.py + +[1] ETC: Encoding Long and Structured Inputs in Transformers, EMNLP 2020. + Joshua Ainslie, Santiago Ontanon, Chris Alberti, Vaclav Cvicek, Zachary + Fisher, Philip Pham, Anirudh Ravula, Sumit Sanghai, Qifan Wang, Li Yang. +""" +from typing import Sequence +import tensorflow as tf +from deeplab2.model import test_utils +from deeplab2.model.encoder import axial_resnet_instances +from deeplab2.model.layers import recompute_grad as recompute_grad_lib + + +def _compute_deeplab_gradients(inputs, model, training): + """Returns an output and all the gradients.""" + variables = model.trainable_weights[::-1] + [inputs] + with tf.GradientTape(persistent=True) as tape: + tape.watch(variables) + out = model(inputs, training=training)['transformer_mask_feature'] + + grads = tape.gradient(out, variables) + return out, grads + + +class RecomputeGradTest(tf.test.TestCase): + + def test_real_deeplab_models(self): + resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='') + tf.config.experimental_connect_to_cluster(resolver) + tf.tpu.experimental.initialize_tpu_system(resolver) + strategy = tf.distribute.TPUStrategy(resolver) + + with strategy.scope(): + # Test max_deeplab_s since it involves all three types of operations: + # convolution, axial-attention, and transformer. + model_name = 'max_deeplab_s' + kwargs = {'num_blocks': [1, 1, 1, 1], + 'backbone_layer_multiplier': 1, + 'width_multiplier': 1.0, + 'bn_layer': tf.keras.layers.experimental.SyncBatchNormalization, + 'conv_kernel_weight_decay': 0.0, + 'block_group_config': { + 'drop_path_keep_prob': 1.0, # Disable the randomness. + 'conv_use_recompute_grad': False, + 'axial_use_recompute_grad': False, + 'recompute_within_stride': 0, + 'transformer_use_recompute_grad': False}} + # Build test input. + tensor = test_utils.create_test_input(1, 33, 33, 3) + test_input = tf.Variable(tensor) + test_input_recompute = tf.Variable(tensor) + + # Build a model. + model = axial_resnet_instances.get_model(model_name, **kwargs) + model(test_input, training=True) + + # Set the batch norm gamma as non-zero so that the bottleneck computation + # affects the output. + for weight in model.trainable_weights: + if '/gamma:0' in weight.name: + weight.assign(tf.ones_like(weight) * 0.1) + + # Activate all recompute_grad for the recomputed model. + kwargs['block_group_config'] = { + 'drop_path_keep_prob': 1.0, + 'conv_use_recompute_grad': True, + 'axial_use_recompute_grad': True, + 'recompute_within_stride': 0, + 'transformer_use_recompute_grad': True} + + # Build the same model but with recompute_grad. 
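+      # The recomputed model uses the same architecture and, via set_weights
+      # below, the same parameters as the baseline model, so outputs and
+      # gradients can be compared directly.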
+      model_recompute = axial_resnet_instances.get_model(model_name, **kwargs)
+      model_recompute(test_input_recompute, training=True)
+      model_recompute.set_weights(model.get_weights())
+
+      @tf.function
+      def function():
+        outs_recompute, grads_recompute = _compute_deeplab_gradients(
+            test_input_recompute, model_recompute, True)
+        outs, grads = _compute_deeplab_gradients(
+            test_input, model, True)
+        return grads_recompute, grads, outs_recompute, outs
+
+      grads_recompute, grads, outs_recompute, outs = strategy.run(function)
+
+      # Similar outputs.
+      self.assertAllClose(outs.values[0], outs_recompute.values[0],
+                          rtol=1e-4, atol=1e-4)
+
+      # Similar gradients.
+      for grad, grad_recompute in zip(grads, grads_recompute):
+        if grad is None or grad_recompute is None:
+          continue
+        grad = grad.values[0]
+        grad_recompute = grad_recompute.values[0]
+        if (isinstance(grad, tf.IndexedSlices) and
+            isinstance(grad_recompute, tf.IndexedSlices)):
+          continue
+        self.assertAllClose(grad, grad_recompute, rtol=1e-1, atol=1e-1)
+
+
+def _compute_gradients(model, x):
+  with tf.GradientTape() as tape:
+    y = model(x)
+  return tape.gradient(
+      y, model.trainable_variables
+      if hasattr(model, 'trainable_variables') else tape.watched_variables())
+
+
+def _make_gradients_op(model, x):
+  f = lambda x: _compute_gradients(model, x)
+  return (tf.function(experimental_compile=True)(lambda: f(x))
+          if tf.executing_eagerly() else tf.compat.v1.tpu.rewrite(f, (x,)))
+
+
+class RecomputeDense(tf.keras.layers.Layer):
+  """Dense layer that recomputes the forward pass during backpropagation."""
+
+  def __init__(self, units: Sequence[int], **kwargs):
+    super(RecomputeDense, self).__init__(**kwargs)
+    self._units = tf.nest.flatten(units)
+
+  def build(self, input_shape: tf.TensorShape):
+    units = input_shape[-1:] + self._units
+    kernels = []
+    biases = []
+    for i in range(1, len(units)):
+      kernels.append(
+          self.add_weight('kernel_{}'.format(i), (units[i - 1], units[i])))
+      biases.append(self.add_weight('bias_{}'.format(i), (units[i],)))
+    self._kernels = kernels
+    self._biases = biases
+    super(RecomputeDense, self).build(input_shape)
+
+  def call(self, inputs: tf.Tensor, **kwargs):
+
+    @recompute_grad_lib.recompute_grad
+    def f(x):
+      for kernel, bias in zip(self._kernels, self._biases):
+        x = tf.nn.tanh(tf.matmul(x, kernel) + bias)
+      return x
+
+    return f(inputs)
+
+
+class RecomputeDense2Args(RecomputeDense):
+  """Extension of `RecomputeDense` that takes and returns 2 arguments."""

+  def build(self, input_shape: Sequence[tf.TensorShape]):
+    super(RecomputeDense2Args, self).build(input_shape[0])
+
+  def call(self, inputs: Sequence[tf.Tensor], **kwargs):
+
+    @recompute_grad_lib.recompute_grad
+    def f(x1, x2):
+      for kernel, bias in zip(self._kernels, self._biases):
+        x1 = tf.nn.tanh(tf.matmul(x1, kernel) + bias)
+      for kernel, bias in zip(self._kernels, self._biases):
+        x2 = tf.nn.tanh(tf.matmul(x2, kernel) + bias)
+      return x1, x2
+
+    return f(*inputs)
+
+
+class RecomputeGradXlaTest(tf.test.TestCase):
+  """Tests for recompute_grad_lib.recompute_grad with XLA."""
+
+  @property
+  def device(self):
+    if tf.config.list_logical_devices('TPU'):
+      return sorted(tf.config.list_logical_devices('TPU'))[0]
+    elif tf.config.list_logical_devices('GPU'):
+      return sorted(tf.config.list_logical_devices('GPU'))[0]
+    else:
+      return sorted(tf.config.list_logical_devices('CPU'))[0]
+
+  def test_xla_model_correctness(self):
+    """Tests correctness of the gradient calculation."""
+
+    def _make_model(input_size):
+      inputs = tf.keras.Input((input_size,))
+      x = inputs
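+      # Two RecomputeDense blocks with two 16-unit tanh layers each mirror the
+      # four Dense(16, tanh) layers of the control model built below.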
+      for _ in range(2):
+        x = RecomputeDense([16] * 2)(x)
+      outputs = tf.keras.layers.Dense(1)(x)
+      return tf.keras.Model(inputs, outputs)
+
+    with tf.device(self.device):
+      recompute_model = _make_model(4)
+      control_model = tf.keras.Sequential([
+          tf.keras.layers.Dense(16, activation='tanh', input_shape=(4,)),
+          tf.keras.layers.Dense(16, activation='tanh'),
+          tf.keras.layers.Dense(16, activation='tanh'),
+          tf.keras.layers.Dense(16, activation='tanh'),
+          tf.keras.layers.Dense(1),
+      ])
+      if not tf.executing_eagerly():
+        self.evaluate(tf.compat.v1.tpu.initialize_system())
+        self.evaluate(tf.compat.v1.initializers.global_variables())
+      for source, target in zip(control_model.trainable_variables,
+                                recompute_model.trainable_variables):
+        self.evaluate(target.assign(source))
+      x = tf.ones((32, 4))
+      actual_gradients = self.evaluate(_make_gradients_op(recompute_model, x))
+      expected_gradients = self.evaluate(_make_gradients_op(control_model, x))
+      for actual, expected in zip(actual_gradients, expected_gradients):
+        self.assertAllClose(actual, expected)
+
+  def test_xla_model_2_argument_case(self):
+    """Tests for a recomputed function that takes and returns multiple args.
+
+    We don't test correctness of the gradients here; we're just making sure
+    `recompute_grad` runs without error in this case.
+    """
+
+    def _make_model(input_size):
+      input1 = tf.keras.Input((input_size,))
+      input2 = tf.keras.Input((input_size,))
+      x = (input1, input2)
+      for _ in range(2):
+        x = RecomputeDense2Args([16] * 2)(x)
+      outputs = tf.keras.layers.Dense(1)(x[0] + x[1])
+      return tf.keras.Model((input1, input2), outputs)
+
+    with tf.device(self.device):
+      recompute_model = _make_model(4)
+      if not tf.executing_eagerly():
+        self.evaluate(tf.compat.v1.tpu.initialize_system())
+        self.evaluate(tf.compat.v1.initializers.global_variables())
+      x1 = tf.ones((32, 4))
+      x2 = 2 * tf.ones((32, 4))
+      _ = self.evaluate(_make_gradients_op(recompute_model, (x1, x2)))
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/model/layers/resized_fuse.py b/model/layers/resized_fuse.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb68fdbf985b2c12600067f8f96d722032914012
--- /dev/null
+++ b/model/layers/resized_fuse.py
@@ -0,0 +1,165 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Implements a resized feature fuser for stacked decoders in MaX-DeepLab.
+
+Reference:
+  MaX-DeepLab: End-to-End Panoptic Segmentation with Mask Transformers,
+  CVPR 2021. https://arxiv.org/abs/2012.00759
+  Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen.
+"""
+
+import tensorflow as tf
+
+from deeplab2.model import utils
+from deeplab2.model.layers import activations
+from deeplab2.model.layers import convolutions
+
+
+class ResizedFuse(tf.keras.layers.Layer):
+  """Fuses features by resizing and 1x1 convolutions.
+
+  This layer fuses all input features to a desired shape, by projecting the
+  features to the desired number of channels, bilinearly resizing the outputs
+  (either upsampling or downsampling), and finally adding the outputs. If the
+  number of input channels equals the desired number of output channels, the
+  1x1 convolutional projection is skipped. If the projection and the bilinear
+  resizing can be fused into a stride 2 convolution, we use this faster
+  implementation. Other strides are also supported with the bilinear resizing,
+  but are probably slower than strided convolutions.
+
+  Reference:
+    MaX-DeepLab: End-to-End Panoptic Segmentation with Mask Transformers,
+    CVPR 2021. https://arxiv.org/abs/2012.00759
+    Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen.
+  """
+
+  def __init__(self,
+               name,
+               height,
+               width,
+               num_channels,
+               activation='relu',
+               bn_layer=tf.keras.layers.BatchNormalization,
+               conv_kernel_weight_decay=0.0):
+    """Initializes a ResizedFuse layer.
+
+    Args:
+      name: A string, the name of this layer.
+      height: An integer, the desired height of the output.
+      width: An integer, the desired width of the output.
+      num_channels: An integer, the number of output channels.
+      activation: A string, type of activation function to apply.
+      bn_layer: A tf.keras.layers.Layer that computes the normalization
+        (default: tf.keras.layers.BatchNormalization).
+      conv_kernel_weight_decay: A float, the weight decay for convolution
+        kernels.
+    """
+    super(ResizedFuse, self).__init__(name=name)
+    self._height = height
+    self._width = width
+    self._num_channels = num_channels
+    self._activation_fn = activations.get_activation(activation)
+    self._bn_layer = bn_layer
+    self._conv_kernel_weight_decay = conv_kernel_weight_decay
+
+  def build(self, input_shapes):
+    for index, feature_shape in enumerate(input_shapes):
+      _, feature_height, feature_width, feature_channels = feature_shape
+      if feature_channels == self._num_channels:
+        continue
+      elif ((feature_height + 1) // 2 == self._height and
+            (feature_width + 1) // 2 == self._width):
+        # Use a stride 2 convolution to accelerate the operation if it
+        # generates the desired spatial shape. Otherwise, the more general 1x1
+        # convolution and bilinear resizing are applied.

+        # In a stacked decoder, we follow relu-conv-bn because we do the
+        # feature summation before relu and after bn (following the ResNet
+        # bottleneck design). This ordering makes it easier to implement.
+        # Besides, it avoids using many 1x1 convolutions when the input has a
+        # correct shape.
+        current_name = '_strided_conv_bn{}'.format(index + 1)
+        utils.safe_setattr(
+            self, current_name, convolutions.Conv2DSame(
+                self._num_channels, 1, current_name[1:],
+                strides=2,
+                use_bias=False,
+                use_bn=True,
+                bn_layer=self._bn_layer,
+                activation='none',
+                conv_kernel_weight_decay=self._conv_kernel_weight_decay))
+      else:
+        # If the input channels do not match the output channels, and the
+        # operation cannot be accelerated by a stride 2 convolution, then we
+        # perform a flexible operation as follows. We first project the feature
+        # to the desired number of channels, and then bilinearly resize the
+        # output to the desired spatial resolution.
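+        # For example, fusing a [batch, 23, 23, 3] feature into a target of
+        # [batch, 11, 11, 6] takes this path, since (23 + 1) // 2 != 11: a 1x1
+        # convolution projects to 6 channels, and the output is bilinearly
+        # resized to 11x11 (see resized_fuse_test.py).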
+        current_name = '_resized_conv_bn{}'.format(index + 1)
+        utils.safe_setattr(
+            self, current_name, convolutions.Conv2DSame(
+                self._num_channels, 1, current_name[1:],
+                use_bias=False,
+                use_bn=True,
+                bn_layer=self._bn_layer,
+                activation='none',
+                conv_kernel_weight_decay=self._conv_kernel_weight_decay))
+
+  def call(self, inputs, training=False):
+    """Performs a forward pass.
+
+    Args:
+      inputs: A list of input [batch, input_height, input_width,
+        input_channels] tensors to fuse, where each input tensor may have
+        different spatial resolutions and numbers of channels.
+      training: A boolean, whether the model is in training mode.
+
+    Returns:
+      output: A fused feature [batch, height, width, num_channels] tensor.
+    """
+
+    output_features = []
+    for index, feature in enumerate(inputs):
+      _, feature_height, feature_width, feature_channels = (
+          feature.get_shape().as_list())
+      if feature_channels == self._num_channels:
+        # Resize the input feature if its number of channels equals that of
+        # the output. We do not use a 1x1 convolution for this case because
+        # the previous operation and the next operation are usually also 1x1
+        # convolutions. Besides, in a stacked decoder, a feature can be reused
+        # many times, so it saves parameters to avoid those many 1x1
+        # convolutions.
+        output_features.append(utils.resize_bilinear(
+            feature, [self._height, self._width],
+            align_corners=True))
+      elif ((feature_height + 1) // 2 == self._height and
+            (feature_width + 1) // 2 == self._width):
+        current_name = '_strided_conv_bn{}'.format(index + 1)
+        feature = self._activation_fn(feature)
+        feature = getattr(self, current_name)(feature, training=training)
+        output_features.append(feature)
+      else:
+        current_name = '_resized_conv_bn{}'.format(index + 1)
+        feature = self._activation_fn(feature)
+        feature = getattr(self, current_name)(feature, training=training)
+        output_features.append(utils.resize_bilinear(
+            feature, [self._height, self._width],
+            align_corners=True))
+      # Set the spatial shape of each output feature if possible.
+      output_features[-1].set_shape(
+          [None,
+           self._height,
+           self._width,
+           self._num_channels])
+    output = tf.add_n(output_features)
+    return self._activation_fn(output)
diff --git a/model/layers/resized_fuse_test.py b/model/layers/resized_fuse_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ba8431462e4bb5b4e714834bef2dbb97facdc46
--- /dev/null
+++ b/model/layers/resized_fuse_test.py
@@ -0,0 +1,56 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for resized_fuse."""
+
+import tensorflow as tf
+
+from deeplab2.model.layers import resized_fuse
+
+
+class ResizedFuseTest(tf.test.TestCase):
+
+  def test_resize_and_fuse_features(self):
+    batch, height, width, channels = 2, 11, 11, 6
+    smaller_height, smaller_width, smaller_channels = 6, 6, 3
+    larger_height1, larger_width1 = 21, 21  # Stride 2 conv.
+    larger_height2, larger_width2 = 22, 22  # Stride 2 conv.
+    larger_height3, larger_width3 = 23, 23  # Conv and resize.
+
+    feature_list = []
+    feature_list.append(tf.zeros([batch, smaller_height, smaller_width,
+                                  smaller_channels]))
+    feature_list.append(tf.zeros([batch, smaller_height, smaller_width,
+                                  channels]))
+    feature_list.append(tf.zeros([batch, height, width, smaller_channels]))
+    feature_list.append(tf.zeros([batch, height, width, channels]))
+    feature_list.append(tf.zeros([batch, larger_height1, larger_width1,
+                                  channels]))
+    feature_list.append(tf.zeros([batch, larger_height1, larger_width1,
+                                  smaller_channels]))
+    feature_list.append(tf.zeros([batch, larger_height2, larger_width2,
+                                  smaller_channels]))
+    feature_list.append(tf.zeros([batch, larger_height3, larger_width3,
+                                  smaller_channels]))
+    layer = resized_fuse.ResizedFuse(name='fuse',
+                                     height=height,
+                                     width=width,
+                                     num_channels=channels)
+    output = layer(feature_list)
+    self.assertEqual(output.get_shape().as_list(), [batch, height, width,
+                                                    channels])
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/model/layers/squeeze_and_excite.py b/model/layers/squeeze_and_excite.py
new file mode 100644
index 0000000000000000000000000000000000000000..d77d73b66dacd3faa47f59106a69d4da1bc6cc10
--- /dev/null
+++ b/model/layers/squeeze_and_excite.py
@@ -0,0 +1,186 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Squeeze and excite layer.
+
+This script implements the squeeze-and-excite (SE) operation, proposed in
+- Squeeze-and-Excitation Networks, Jie Hu, Li Shen, Samuel Albanie,
+Gang Sun, Enhua Wu. In CVPR 2018.
+
+Recently, this SE operation was further simplified with a single fully
+connected layer, referred to as simplified_squeeze_and_excite in our
+implementation. For details, please see
+- Lee and Park proposed to use only one fully connected layer in SE.
+CenterMask: Real-Time Anchor-Free Instance Segmentation.
+Youngwan Lee and Jongyoul Park. In CVPR 2020.
+"""
+from typing import Optional
+
+from absl import logging
+import tensorflow as tf
+
+from deeplab2.model import utils
+from deeplab2.model.layers import activations
+
+layers = tf.keras.layers
+
+
+class SimplifiedSqueezeAndExcite(tf.keras.layers.Layer):
+  """A simplified squeeze-and-excite layer.
+
+  The original squeeze-and-excitation (SE) operation is proposed in
+  Squeeze-and-Excitation Networks, Jie Hu, Li Shen, Samuel Albanie,
+  Gang Sun, Enhua Wu. In CVPR 2018.
+
+  Lee and Park proposed to use only one fully connected layer in SE.
+  CenterMask: Real-Time Anchor-Free Instance Segmentation.
+  Youngwan Lee and Jongyoul Park. In CVPR 2020.
+
+  In this class, we implement the simplified version of SE.
+
+  Additionally, we follow MobileNetV3 to use the hard sigmoid function.
+  """
+
+  def __init__(self, squeeze_channels, name=None):
+    """Initializes a simplified squeeze-and-excite layer.
+
+    Args:
+      squeeze_channels: Integer, channels for the squeezed features.
+      name: An optional string specifying the operation name.
+    """
+    super(SimplifiedSqueezeAndExcite, self).__init__(name=name)
+    self._squeeze_channels = squeeze_channels
+
+    self._se_conv = layers.Conv2D(self._squeeze_channels,
+                                  1,
+                                  name='squeeze_and_excite',
+                                  use_bias=True,
+                                  kernel_initializer='VarianceScaling')
+    self._hard_sigmoid = activations.get_activation('hard_sigmoid')
+
+  def call(self, input_tensor):
+    """Performs a forward pass.
+
+    Args:
+      input_tensor: An input tensor of type tf.Tensor with shape [batch,
+        height, width, channels].
+
+    Returns:
+      The output tensor.
+    """
+    pooled = tf.reduce_mean(input_tensor, [1, 2], keepdims=True)
+    squeezed = self._se_conv(pooled)
+    excited = self._hard_sigmoid(squeezed) * input_tensor
+    return excited
+
+  def get_config(self):
+    config = {
+        'squeeze_channels': self._squeeze_channels,
+    }
+    base_config = super(SimplifiedSqueezeAndExcite, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class SqueezeAndExcite(tf.keras.layers.Layer):
+  """Creates a squeeze and excitation layer.
+
+  Reference: Squeeze-and-Excitation Networks, Jie Hu, Li Shen, Samuel Albanie,
+  Gang Sun, Enhua Wu. In CVPR 2018.
+  This implementation follows the original SE and differs from the above
+  simplified version.
+  """
+
+  def __init__(
+      self,
+      in_filters: int,
+      out_filters: int,
+      se_ratio: float,
+      divisible_by: int = 1,
+      kernel_initializer: str = 'VarianceScaling',
+      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
+      activation: str = 'relu',
+      gating_activation: str = 'sigmoid',
+      name: Optional[str] = None):
+    """Initializes a squeeze and excitation layer.
+
+    Args:
+      in_filters: The number of filters that se_ratio should be applied to.
+      out_filters: The number of filters of the output tensor.
+      se_ratio: The SE ratio for the squeeze and excitation layer.
+      divisible_by: An `int` that ensures all inner dimensions are divisible
+        by this number.
+      kernel_initializer: The kernel_initializer for convolutional layers.
+      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Defaults to None.
+      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for
+        Conv2D. Defaults to None.
+      activation: The name of the activation function.
+      gating_activation: The name of the activation function for the final
+        gating function.
+      name: The layer name.
+    """
+    super(SqueezeAndExcite, self).__init__(name=name)
+
+    self._in_filters = in_filters
+    self._out_filters = out_filters
+    self._se_ratio = se_ratio
+    self._divisible_by = divisible_by
+    self._activation = activation
+    self._gating_activation = gating_activation
+    self._kernel_initializer = kernel_initializer
+    self._kernel_regularizer = kernel_regularizer
+    self._bias_regularizer = bias_regularizer
+    if tf.keras.backend.image_data_format() == 'channels_last':
+      self._spatial_axis = [1, 2]
+    else:
+      self._spatial_axis = [2, 3]
+    self._activation_fn = activations.get_activation(activation)
+    self._gating_activation_fn = activations.get_activation(gating_activation)
+
+    num_reduced_filters = utils.make_divisible(
+        max(1, int(self._in_filters * self._se_ratio)),
+        divisor=self._divisible_by)
+    if self._se_ratio > 1.0:
+      logging.warn('Squeezing ratio %f is larger than 1.0.', self._se_ratio)
+
+    self._se_reduce = tf.keras.layers.Conv2D(
+        filters=num_reduced_filters,
+        kernel_size=1,
+        strides=1,
+        padding='same',
+        use_bias=True,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer,
+        name=name + '_reduce')
+
+    self._se_expand = tf.keras.layers.Conv2D(
+        filters=self._out_filters,
+        kernel_size=1,
+        strides=1,
+        padding='same',
+        use_bias=True,
+        kernel_initializer=self._kernel_initializer,
+        kernel_regularizer=self._kernel_regularizer,
+        bias_regularizer=self._bias_regularizer,
+        name=name + '_expand')
+
+  def call(self, inputs):
+    x = tf.reduce_mean(inputs, self._spatial_axis, keepdims=True)
+    x = self._activation_fn(self._se_reduce(x))
+    x = self._gating_activation_fn(self._se_expand(x))
+    return x * inputs
diff --git a/model/layers/squeeze_and_excite_test.py b/model/layers/squeeze_and_excite_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ae4c864378d731fb259a76e3f23259298eba6a9
--- /dev/null
+++ b/model/layers/squeeze_and_excite_test.py
@@ -0,0 +1,50 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for squeeze_and_excite.py."""
+
+import tensorflow as tf
+
+from deeplab2.model.layers import squeeze_and_excite
+
+
+class SqueezeAndExciteTest(tf.test.TestCase):
+
+  def test_simplified_squeeze_and_excite_input_output_shape(self):
+    # Test the shape of input and output of SimplifiedSqueezeAndExcite.
+    channels = 32
+    input_tensor = tf.random.uniform(shape=(3, 65, 65, channels))
+    layer_op = squeeze_and_excite.SimplifiedSqueezeAndExcite(
+        channels)
+    output_tensor = layer_op(input_tensor)
+    self.assertListEqual(input_tensor.get_shape().as_list(),
+                         output_tensor.get_shape().as_list())
+
+  def test_squeeze_and_excite_input_output_shape(self):
+    # Test the shape of input and output of SqueezeAndExcite.
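+    # With in_filters=32 and se_ratio=8, the squeeze convolution uses
+    # make_divisible(int(32 * 8)) = 256 reduced filters, and the ratio > 1.0
+    # warning in SqueezeAndExcite.__init__ fires.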
+    channels = 32
+    input_tensor = tf.random.uniform(shape=(3, 65, 65, channels))
+    layer_op = squeeze_and_excite.SqueezeAndExcite(
+        in_filters=channels,
+        out_filters=channels,
+        se_ratio=8,
+        name='se')
+    output_tensor = layer_op(input_tensor)
+    self.assertListEqual(input_tensor.get_shape().as_list(),
+                         output_tensor.get_shape().as_list())
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/model/layers/stems.py b/model/layers/stems.py
new file mode 100644
index 0000000000000000000000000000000000000000..03e315cbb89afc268a8b370c2a031fda181dfdcf
--- /dev/null
+++ b/model/layers/stems.py
@@ -0,0 +1,113 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This script contains STEMs for neural networks.
+
+The `STEM` is defined as the first few convolutions that process the input
+image to a spatially smaller feature map (e.g., output stride = 2).
+
+Reference code:
+https://github.com/tensorflow/models/blob/master/research/deeplab/core/resnet_v1_beta.py
+"""
+import tensorflow as tf
+
+from deeplab2.model.layers import convolutions
+
+layers = tf.keras.layers
+
+
+class InceptionSTEM(tf.keras.layers.Layer):
+  """An InceptionSTEM layer.
+
+  This class builds an InceptionSTEM layer which can be used as the first few
+  layers in a neural network. In particular, InceptionSTEM contains three
+  consecutive 3x3 convolutions.
+
+  Reference:
+  - Christian Szegedy, Sergey Ioffe, Vincent Vanhoucke, and Alexander Alemi.
+    "Inception-v4, inception-resnet and the impact of residual connections on
+    learning." In AAAI, 2017.
+  """
+
+  def __init__(self,
+               bn_layer=tf.keras.layers.BatchNormalization,
+               width_multiplier=1.0,
+               conv_kernel_weight_decay=0.0,
+               activation='relu'):
+    """Creates the InceptionSTEM layer.
+
+    Args:
+      bn_layer: An optional tf.keras.layers.Layer that computes the
+        normalization (default: tf.keras.layers.BatchNormalization).
+      width_multiplier: A float multiplier, controlling the value of
+        convolution output channels.
+      conv_kernel_weight_decay: A float, the weight decay for convolution
+        kernels.
+      activation: A string specifying an activation function to be used in
+        this stem.
+    """
+    super(InceptionSTEM, self).__init__(name='stem')
+
+    self._conv1_bn_act = convolutions.Conv2DSame(
+        output_channels=int(64 * width_multiplier),
+        kernel_size=3,
+        name='conv1_bn_act',
+        strides=2,
+        use_bias=False,
+        use_bn=True,
+        bn_layer=bn_layer,
+        activation=activation,
+        conv_kernel_weight_decay=conv_kernel_weight_decay)
+
+    self._conv2_bn_act = convolutions.Conv2DSame(
+        output_channels=int(64 * width_multiplier),
+        kernel_size=3,
+        name='conv2_bn_act',
+        strides=1,
+        use_bias=False,
+        use_bn=True,
+        bn_layer=bn_layer,
+        activation=activation,
+        conv_kernel_weight_decay=conv_kernel_weight_decay)
+
+    self._conv3_bn = convolutions.Conv2DSame(
+        output_channels=int(128 * width_multiplier),
+        kernel_size=3,
+        strides=1,
+        use_bias=False,
+        use_bn=True,
+        bn_layer=bn_layer,
+        activation='none',
+        name='conv3_bn',
+        conv_kernel_weight_decay=conv_kernel_weight_decay)
+
+  def call(self, input_tensor, training=False):
+    """Performs a forward pass.
+
+    Args:
+      input_tensor: An input tensor of type tf.Tensor with shape [batch,
+        height, width, channels].
+      training: A boolean flag indicating whether training behavior should be
+        used (default: False).
+
+    Returns:
+      The output tensor of the stem, which is not activated: the last
+      convolution is followed by batch norm only.
+    """
+    x = self._conv1_bn_act(input_tensor, training=training)
+    x = self._conv2_bn_act(x, training=training)
+    x = self._conv3_bn(x, training=training)
+    return x
diff --git a/model/layers/stems_test.py b/model/layers/stems_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..bac14055be6b1cf8f100e1a18cdeb59834471cad
--- /dev/null
+++ b/model/layers/stems_test.py
@@ -0,0 +1,40 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for stems."""
+import tensorflow as tf
+
+from deeplab2.model.layers import stems
+from deeplab2.utils import test_utils
+
+
+class StemsTest(tf.test.TestCase):
+
+  def test_inception_stem_output_shape(self):
+    batch = 2
+    height, width = 65, 65
+    input_tensor = test_utils.create_test_input(batch, height, width, 3)
+    model = stems.InceptionSTEM()
+    output_tensor = model(input_tensor)
+    expected_height = (height - 1) // 2 + 1
+    expected_width = (width - 1) // 2 + 1
+    expected_channels = 128
+    self.assertListEqual(
+        output_tensor.get_shape().as_list(),
+        [batch, expected_height, expected_width, expected_channels])
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/model/loss/base_loss.py b/model/loss/base_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4614c8b03e8a7a456eb73d7b8d0a685f86757cf
--- /dev/null
+++ b/model/loss/base_loss.py
@@ -0,0 +1,559 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This file contains basic loss classes used in the DeepLab model."""
+
+from typing import Text, Dict, Callable, Optional
+
+import tensorflow as tf
+from deeplab2.model import utils
+
+
+def compute_average_top_k_loss(loss: tf.Tensor,
+                               top_k_percentage: float) -> tf.Tensor:
+  """Computes the average top-k loss per sample.
+
+  Args:
+    loss: A tf.Tensor with 2 or more dimensions of shape [batch, ...].
+    top_k_percentage: A float representing the percentage of pixels that
+      should be used for calculating the loss.
+
+  Returns:
+    A tensor of shape [batch] containing the mean top-k loss per sample. Since
+    reduction depends on the tf.distribute.Strategy in use, we return the loss
+    per sample and require explicit averaging by the user.
+  """
+  loss = tf.reshape(loss, shape=(tf.shape(loss)[0], -1))
+
+  if top_k_percentage != 1.0:
+    num_elements_per_sample = tf.shape(loss)[1]
+    top_k_pixels = tf.cast(
+        tf.math.round(top_k_percentage *
+                      tf.cast(num_elements_per_sample, tf.float32)), tf.int32)
+
+    def top_k_1d(inputs):
+      return tf.math.top_k(inputs, top_k_pixels, sorted=False)[0]
+    loss = tf.map_fn(fn=top_k_1d, elems=loss)
+
+  # Compute mean loss over spatial dimension.
+  num_non_zero = tf.reduce_sum(tf.cast(tf.not_equal(loss, 0.0), tf.float32), 1)
+  loss_sum_per_sample = tf.reduce_sum(loss, 1)
+  return tf.math.divide_no_nan(loss_sum_per_sample, num_non_zero)
+
+
+def compute_mask_dice_loss(y_true: tf.Tensor,
+                           y_pred: tf.Tensor,
+                           prediction_activation='softmax') -> tf.Tensor:
+  """Computes the Mask Dice loss between y_true and y_pred masks.
+
+  Reference:
+    [1] Milletari, F., Navab, N., Ahmadi, S.A.: V-net: Fully convolutional
+      neural networks for volumetric medical image segmentation. In: 3DV (2016)
+      https://arxiv.org/abs/1606.04797
+
+  Args:
+    y_true: A tf.Tensor of shape [batch, height, width, channels] (or [batch,
+      length, channels]) containing the ground-truth. The channel dimension
+      indicates the mask ID in MaX-DeepLab, instead of a "class" dimension in
+      the V-net paper. Each slice [batch, height, width, :] (or [batch,
+      length, :]) should contain one-hot encodings only, with valid pixels
+      having one and only one 1.0, and with void pixels being all 0.0. The
+      valid pixels of the masks do not and should not overlap because of the
+      non-overlapping definition of panoptic segmentation. The output loss is
+      computed and normalized by valid (not void) pixels.
+    y_pred: A tf.Tensor of shape [batch, height, width, channels] (or [batch,
+      length, channels]) containing the prediction.
+    prediction_activation: A String indicating activation function of the
+      prediction. It should be either 'sigmoid' or 'softmax'.
+
+  Returns:
+    A tf.Tensor of shape [batch, channels] with the computed dice loss value.
+
+  Raises:
+    ValueError: An error occurs when prediction_activation is not either
+      'sigmoid' or 'softmax'.
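+
+  An illustrative example (not part of the V-net reference): for a single
+  valid pixel whose one-hot ground truth is [1, 0] and whose softmax
+  prediction is also [1, 0], channel 0 yields
+  1 - (2 * 1 + 1) / (1 + 1 + 1) = 0, i.e., a perfect match has zero loss.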
+ """ + tf.debugging.assert_rank_in( + y_pred, [3, 4], message='Input tensors y_pred must have rank 3 or 4.') + tf.debugging.assert_rank_in( + y_true, [3, 4], message='Input tensors y_true must have rank 3 or 4.') + + shape_list = y_true.shape.as_list() + batch, channels = shape_list[0], shape_list[-1] + if prediction_activation == 'sigmoid': + y_pred = tf.math.sigmoid(y_pred) + elif prediction_activation == 'softmax': + y_pred = tf.nn.softmax(y_pred, axis=-1) + else: + raise ValueError( + "prediction_activation should be either 'sigmoid' or 'softmax'") + + y_true_flat = tf.reshape(y_true, [batch, -1, channels]) + # valid_flat indicates labeled pixels in the groudtruth. y_true is one-hot + # encodings only, with valid pixels having one and only one 1.0, and with + # invalid pixels having 0.0 values in all the channels. The valid pixels of + # the masks do not overlap because of the non-overlapping definition of + # panoptic segmentation. + valid_flat = tf.reduce_sum(y_true_flat, axis=-1, keepdims=True) + y_pred_flat = tf.reshape( + y_pred, [batch, -1, channels]) * valid_flat + # Use smooth = 1 to avoid division by zero when both y_pred and y_true are + # zeros. + smooth = 1.0 + intersection = 2 * tf.reduce_sum(y_pred_flat * y_true_flat, axis=1) + smooth + denominator = (tf.reduce_sum(y_pred_flat, axis=1) + + tf.reduce_sum(y_true_flat, axis=1) + smooth) + loss = 1. - tf.math.divide_no_nan(intersection, denominator) + return loss + + +def mean_absolute_error(y_true: tf.Tensor, + y_pred: tf.Tensor, + force_keep_dims=False) -> tf.Tensor: + """Computes the per-pixel mean absolute error for 3D and 4D tensors. + + Default reduction behavior: If a 3D tensor is used, no reduction is applied. + In case of a 4D tensor, reduction is applied. This behavior can be overridden + by force_keep_dims. + Note: tf.keras.losses.mean_absolute_error always reduces the output by one + dimension. + + Args: + y_true: A tf.Tensor of shape [batch, height, width] or [batch, height, + width, channels] containing the ground-truth. + y_pred: A tf.Tensor of shape [batch, height, width] or [batch, height, + width, channels] containing the prediction. + force_keep_dims: A boolean flag specifying whether no reduction should be + applied. + + Returns: + A tf.Tensor with the mean absolute error. + """ + tf.debugging.assert_rank_in( + y_pred, [3, 4], message='Input tensors must have rank 3 or 4.') + if len(y_pred.shape.as_list()) == 3 or force_keep_dims: + return tf.abs(y_true - y_pred) + else: + return tf.reduce_mean(tf.abs(y_true - y_pred), axis=[3]) + + +def mean_squared_error(y_true: tf.Tensor, + y_pred: tf.Tensor, + force_keep_dims=False) -> tf.Tensor: + """Computes the per-pixel mean squared error for 3D and 4D tensors. + + Default reduction behavior: If a 3D tensor is used, no reduction is applied. + In case of a 4D tensor, reduction is applied. This behavior can be overridden + by force_keep_dims. + Note: tf.keras.losses.mean_squared_error always reduces the output by one + dimension. + + Args: + y_true: A tf.Tensor of shape [batch, height, width] or [batch, height, + width, channels] containing the ground-truth. + y_pred: A tf.Tensor of shape [batch, height, width] or [batch, height, + width, channels] containing the prediction. + force_keep_dims: A boolean flag specifying whether no reduction should be + applied. + + Returns: + A tf.Tensor with the mean squared error. 
+ """ + tf.debugging.assert_rank_in( + y_pred, [3, 4], message='Input tensors must have rank 3 or 4.') + if len(y_pred.shape.as_list()) == 3 or force_keep_dims: + return tf.square(y_true - y_pred) + else: + return tf.reduce_mean(tf.square(y_true - y_pred), axis=[3]) + + +def encode_one_hot(gt: tf.Tensor, + num_classes: int, + weights: tf.Tensor, + ignore_label: Optional[int]): + """Helper function for one-hot encoding of integer labels. + + Args: + gt: A tf.Tensor providing ground-truth information. Integer type label. + num_classes: An integer indicating the number of classes considered in the + ground-truth. It is used as 'depth' in tf.one_hot(). + weights: A tf.Tensor containing weights information. + ignore_label: An integer specifying the ignore label or None. + + Returns: + gt: A tf.Tensor of one-hot encoded gt labels. + weights: A tf.Tensor with ignore_label considered. + """ + if ignore_label is not None: + keep_mask = tf.cast(tf.not_equal(gt, ignore_label), dtype=tf.float32) + else: + keep_mask = tf.ones_like(gt, dtype=tf.float32) + gt = tf.stop_gradient(tf.one_hot(gt, num_classes)) + weights = tf.multiply(weights, keep_mask) + return gt, weights + + +def is_one_hot(gt: tf.Tensor, pred: tf.Tensor): + """Helper function for checking if gt tensor is one-hot encoded or not. + + Args: + gt: A tf.Tensor providing ground-truth information. + pred: A tf.Tensor providing prediction information. + + Returns: + A boolean indicating whether the gt is one-hot encoded (True) or + in integer type (False). + """ + gt_shape = gt.get_shape().as_list() + pred_shape = pred.get_shape().as_list() + # If the ground truth is one-hot encoded, the rank of the ground truth should + # match that of the prediction. In addition, we check that the first + # dimension, batch_size, and the last dimension, channels, should also match + # the prediction. However, we still allow spatial dimensions, e.g., height and + # width, to be different since we will downsample the ground truth if needed. + return (len(gt_shape) == len(pred_shape) and + gt_shape[0] == pred_shape[0] and gt_shape[-1] == pred_shape[-1]) + + +def _ensure_topk_value_is_percentage(top_k_percentage: float): + """Checks if top_k_percentage is between 0.0 and 1.0. + + Args: + top_k_percentage: The floating point value to check. + """ + if top_k_percentage < 0.0 or top_k_percentage > 1.0: + raise ValueError('The top-k percentage parameter must lie within 0.0 and ' + '1.0, but %f was given' % top_k_percentage) + + +class TopKGeneralLoss(tf.keras.losses.Loss): + """This class contains code to compute the top-k loss.""" + + def __init__(self, + loss_function: Callable[[tf.Tensor, tf.Tensor], tf.Tensor], + gt_key: Text, + pred_key: Text, + weight_key: Text, + top_k_percent_pixels: float = 1.0): + """Initializes a top-k L1 loss. + + Args: + loss_function: A callable loss function. + gt_key: A key to extract the ground-truth tensor. + pred_key: A key to extract the prediction tensor. + weight_key: A key to extract the weight tensor. + top_k_percent_pixels: An optional float specifying the percentage of + pixels used to compute the loss. The value must lie within [0.0, 1.0]. + """ + # Implicit reduction might mess with tf.distribute.Strategy, hence we + # explicitly reduce the loss. 
+    super(TopKGeneralLoss,
+          self).__init__(reduction=tf.keras.losses.Reduction.NONE)
+
+    _ensure_topk_value_is_percentage(top_k_percent_pixels)
+
+    self._loss_function = loss_function
+    self._top_k_percent_pixels = top_k_percent_pixels
+    self._gt_key = gt_key
+    self._pred_key = pred_key
+    self._weight_key = weight_key
+
+  def call(self, y_true: Dict[Text, tf.Tensor],
+           y_pred: Dict[Text, tf.Tensor]) -> tf.Tensor:
+    """Computes the top-k loss.
+
+    Args:
+      y_true: A dict of tensors providing ground-truth information.
+      y_pred: A dict of tensors providing predictions.
+
+    Returns:
+      A tensor of shape [batch] containing the loss per sample.
+    """
+    gt = y_true[self._gt_key]
+    pred = y_pred[self._pred_key]
+    weights = y_true[self._weight_key]
+
+    per_pixel_loss = self._loss_function(gt, pred)
+    per_pixel_loss = tf.multiply(per_pixel_loss, weights)
+
+    return compute_average_top_k_loss(per_pixel_loss,
+                                      self._top_k_percent_pixels)
+
+
+class TopKCrossEntropyLoss(tf.keras.losses.Loss):
+  """This class contains code for top-k cross-entropy."""
+
+  def __init__(self,
+               gt_key: Text,
+               pred_key: Text,
+               weight_key: Text,
+               num_classes: Optional[int],
+               ignore_label: Optional[int],
+               top_k_percent_pixels: float = 1.0,
+               dynamic_weight: bool = False):
+    """Initializes a top-k cross entropy loss.
+
+    Args:
+      gt_key: A key to extract the ground-truth tensor.
+      pred_key: A key to extract the prediction tensor.
+      weight_key: A key to extract the weight tensor.
+      num_classes: An integer specifying the number of classes in the dataset.
+      ignore_label: An optional integer specifying the ignore label or None.
+      top_k_percent_pixels: An optional float specifying the percentage of
+        pixels used to compute the loss. The value must lie within [0.0, 1.0].
+      dynamic_weight: A boolean indicating whether the weights are determined
+        dynamically w.r.t. the class confidence of each predicted mask.
+
+    Raises:
+      ValueError: An error occurs when top_k_percent_pixels is not between 0.0
+        and 1.0.
+    """
+    # Implicit reduction might mess with tf.distribute.Strategy, hence we
+    # explicitly reduce the loss.
+    super(TopKCrossEntropyLoss,
+          self).__init__(reduction=tf.keras.losses.Reduction.NONE)
+
+    _ensure_topk_value_is_percentage(top_k_percent_pixels)
+
+    self._num_classes = num_classes
+    self._ignore_label = ignore_label
+    self._top_k_percent_pixels = top_k_percent_pixels
+    self._gt_key = gt_key
+    self._pred_key = pred_key
+    self._weight_key = weight_key
+    self._dynamic_weight = dynamic_weight
+
+  def call(self, y_true: Dict[Text, tf.Tensor],
+           y_pred: Dict[Text, tf.Tensor]) -> tf.Tensor:
+    """Computes the top-k cross-entropy loss.
+
+    Args:
+      y_true: A dict of tensors providing ground-truth information. The tensors
+        can be either integer type or one-hot encoded. When integer type, the
+        shape can be either [batch, num_elements] or [batch, height, width].
+        When one-hot encoded, the shape can be [batch, num_elements, channels]
+        or [batch, height, width, channels].
+      y_pred: A dict of tensors providing predictions. The tensors are of shape
+        [batch, num_elements, channels] or [batch, height, width, channels]. If
+        the prediction is 2D (with height and width), we allow the spatial
+        dimension to be strided_height and strided_width. In this case, we
+        downsample the ground truth accordingly.
+
+    Returns:
+      A tensor of shape [batch] containing the loss per image.
+
+    Raises:
+      ValueError: If the prediction is 1D (with the length dimension) but its
+        length does not match that of the ground truth.
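+
+    For example (an illustrative sketch), an integer ground truth of shape
+    [2, 65, 65] may be paired with logits of shape [2, 33, 33, channels]; the
+    ground truth and weights are then strided-downsampled to 33x33.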
+ """ + gt = y_true[self._gt_key] + pred = y_pred[self._pred_key] + gt_shape = gt.get_shape().as_list() + pred_shape = pred.get_shape().as_list() + if self._dynamic_weight: + weights = y_pred[self._weight_key] + else: + weights = y_true[self._weight_key] + + # Downsample the ground truth for 2D prediction cases. + if len(pred_shape) == 4 and gt_shape[1:3] != pred_shape[1:3]: + gt = utils.strided_downsample(gt, pred_shape[1:3]) + weights = utils.strided_downsample(weights, pred_shape[1:3]) + elif len(pred_shape) == 3 and gt_shape[1] != pred_shape[1]: + # We don't support downsampling for 1D predictions. + raise ValueError('The shape of gt does not match the shape of pred.') + + if is_one_hot(gt, pred): + gt = tf.cast(gt, tf.float32) + else: + gt = tf.cast(gt, tf.int32) + gt, weights = encode_one_hot(gt, self._num_classes, weights, + self._ignore_label) + pixel_losses = tf.keras.backend.categorical_crossentropy( + gt, pred, from_logits=True) + weighted_pixel_losses = tf.multiply(pixel_losses, weights) + + return compute_average_top_k_loss(weighted_pixel_losses, + self._top_k_percent_pixels) + + +class FocalCrossEntropyLoss(tf.keras.losses.Loss): + """This class contains code for focal cross-entropy.""" + + def __init__(self, + gt_key: Text, + pred_key: Text, + weight_key: Text, + num_classes: Optional[int], + ignore_label: Optional[int], + focal_loss_alpha: float = 0.75, + focal_loss_gamma: float = 0.0, + background_channel_index: int = -1, + dynamic_weight: bool = True): + """Initializes a focal cross entropy loss. + + FocalCrossEntropyLoss supports focal-loss mode with integer + or one-hot ground-truth labels. + Reference: + [1] Lin, T. Y., Goyal, P., Girshick, R., He, K., & Dollár, P. Focal loss for + dense object detection. In Proceedings of the IEEE International + Conference on Computer Vision (ICCV). (2017) + https://arxiv.org/abs/1708.02002 + + Args: + gt_key: A key to extract the ground-truth tensor. + pred_key: A key to extract the prediction tensor. + weight_key: A key to extract the weight tensor. + num_classes: An integer specifying the number of classes in the dataset. + ignore_label: An optional integer specifying the ignore label or None. + Only effective when ground truth labels are in integer mode. + focal_loss_alpha: An optional float specifying the coefficient that + weights between positive (matched) and negative (unmatched) masks in + focal loss. The positives are weighted by alpha, while the negatives + are weighted by (1. - alpha). Default to 0.75. + focal_loss_gamma: An optional float specifying the coefficient that + weights probability (pt) term in focal loss. Focal loss = - ((1 - pt) ^ + gamma) * log(pt). Default to 0.0. + background_channel_index: The index for background channel. When alpha + is used, we assume the last channel is background and others are + foreground. Default to -1. + dynamic_weight: A boolean indicating whether the weights are determined + dynamically w.r.t. the class confidence of each predicted mask. + """ + # Implicit reduction might mess with tf.distribute.Strategy, hence we + # explicitly reduce the loss. 
+    super(FocalCrossEntropyLoss,
+          self).__init__(reduction=tf.keras.losses.Reduction.NONE)
+
+    self._num_classes = num_classes
+    self._ignore_label = ignore_label
+    self._focal_loss_alpha = focal_loss_alpha
+    self._focal_loss_gamma = focal_loss_gamma
+    self._background_channel_index = background_channel_index
+    self._gt_key = gt_key
+    self._pred_key = pred_key
+    self._weight_key = weight_key
+    self._dynamic_weight = dynamic_weight
+
+  def call(self, y_true: Dict[Text, tf.Tensor],
+           y_pred: Dict[Text, tf.Tensor]) -> tf.Tensor:
+    """Computes the focal cross-entropy loss.
+
+    Args:
+      y_true: A dict of tensors providing ground-truth information. The tensors
+        can be either integer type or one-hot encoded. When integer type, the
+        shape can be either [batch, num_elements] or [batch, height, width].
+        When one-hot encoded, the shape can be [batch, num_elements, channels]
+        or [batch, height, width, channels].
+      y_pred: A dict of tensors providing predictions. The tensors are of shape
+        [batch, num_elements, channels] or [batch, height, width, channels].
+
+    Returns:
+      A tensor of shape [batch] containing the loss per image.
+    """
+    gt = y_true[self._gt_key]
+    pred = y_pred[self._pred_key]
+    if self._dynamic_weight:
+      # Dynamic weights w.r.t. the class confidence of each predicted mask.
+      weights = y_pred[self._weight_key]
+    else:
+      weights = y_true[self._weight_key]
+
+    if is_one_hot(gt, pred):
+      gt = tf.cast(gt, tf.float32)
+    else:
+      gt = tf.cast(gt, tf.int32)
+      gt, weights = encode_one_hot(gt, self._num_classes, weights,
+                                   self._ignore_label)
+    pixel_losses = tf.nn.softmax_cross_entropy_with_logits(gt, pred)
+    # Focal loss: scale the cross entropy by (1 - pt) ^ gamma.
+    if self._focal_loss_gamma == 0.0:
+      pixel_focal_losses = pixel_losses
+    else:
+      predictions = tf.nn.softmax(pred, axis=-1)
+      pt = tf.reduce_sum(predictions * gt, axis=-1)
+      pixel_focal_losses = tf.multiply(
+          tf.pow(1.0 - pt, self._focal_loss_gamma), pixel_losses)
+
+    if self._focal_loss_alpha >= 0:
+      # alpha_weights = alpha * positive masks + (1 - alpha) * negative masks.
+      alpha = self._focal_loss_alpha
+      alpha_weights = (
+          alpha * (1.0 - gt[..., self._background_channel_index]) +
+          (1 - alpha) * gt[..., self._background_channel_index])
+      pixel_focal_losses = alpha_weights * pixel_focal_losses
+    weighted_pixel_losses = tf.multiply(pixel_focal_losses, weights)
+    weighted_pixel_losses = tf.reshape(
+        weighted_pixel_losses, shape=(tf.shape(weighted_pixel_losses)[0], -1))
+    # Compute mean loss over spatial dimension.
+    num_non_zero = tf.reduce_sum(
+        tf.cast(tf.not_equal(weighted_pixel_losses, 0.0), tf.float32), 1)
+    loss_sum_per_sample = tf.reduce_sum(weighted_pixel_losses, 1)
+    return tf.math.divide_no_nan(loss_sum_per_sample, num_non_zero)
+
+
+class MaskDiceLoss(tf.keras.losses.Loss):
+  """This class contains code to compute Mask Dice loss.
+
+  The channel dimension in Mask Dice loss indicates the mask ID in MaX-DeepLab,
+  instead of a "class" dimension in the original Dice loss.
+  """
+
+  def __init__(self,
+               gt_key: Text,
+               pred_key: Text,
+               weight_key: Text,
+               prediction_activation='softmax'):
+    """Initializes a Mask Dice loss.
+
+    Args:
+      gt_key: A key to extract the ground-truth tensor.
+      pred_key: A key to extract the prediction tensor.
+      weight_key: A key to extract the weight tensor.
+      prediction_activation: A String indicating activation function of the
+        prediction. It should be either 'sigmoid' or 'softmax'.
+    """
+    # Implicit reduction might mess with tf.distribute.Strategy, hence we
+    # explicitly reduce the loss.
+ super(MaskDiceLoss, self).__init__(reduction=tf.keras.losses.Reduction.NONE) + + self._gt_key = gt_key + self._pred_key = pred_key + self._weight_key = weight_key + self._prediction_activation = prediction_activation + + def call(self, y_true: Dict[Text, tf.Tensor], + y_pred: Dict[Text, tf.Tensor]) -> tf.Tensor: + """Computes the Mask Dice loss. + + Args: + y_true: A dict of tensors providing ground-truth information. + y_pred: A dict of tensors providing predictions. + + Returns: + A tensor of shape [batch] containing the loss per sample. + """ + gt = y_true[self._gt_key] + pred = y_pred[self._pred_key] + # Dynamic weights w.r.t. the class confidence of each predicted mask. + weights = y_pred[self._weight_key] + weighted_dice_losses = tf.multiply( + compute_mask_dice_loss(gt, pred, self._prediction_activation), + weights) + # Reduce_sum over the channels (i.e., number of masks). + return tf.reduce_sum(weighted_dice_losses, axis=-1) diff --git a/model/loss/base_loss_test.py b/model/loss/base_loss_test.py new file mode 100644 index 0000000000000000000000000000000000000000..c6855eefa1a6d16a02e247e8bc3e9169b1ebdb17 --- /dev/null +++ b/model/loss/base_loss_test.py @@ -0,0 +1,279 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for base_loss.py.""" + +import numpy as np +import tensorflow as tf + +from deeplab2.model.loss import base_loss as loss + + +class BaseLossTest(tf.test.TestCase): + + def test_general_loss(self): + y_true = { + 'gt': tf.ones([2, 33, 33]) * 2, + 'weight': tf.ones([2, 33, 33]) + } + y_pred = {'pred': tf.zeros([2, 33, 33])} + + with self.subTest('L1'): + loss_layer = loss.TopKGeneralLoss( + loss.mean_absolute_error, + 'gt', + 'pred', + 'weight') + expected_loss = tf.ones([2]) * 2 + with self.subTest('MSE'): + loss_layer = loss.TopKGeneralLoss( + loss.mean_squared_error, + 'gt', + 'pred', + 'weight') + expected_loss = tf.ones([2]) * 4 + loss_result = loss_layer(y_true, y_pred) + np.testing.assert_almost_equal( + loss_result.numpy(), expected_loss.numpy(), decimal=5) + + def test_general_loss_weights(self): + weights = np.zeros((2, 33, 33)) + weights[:, 17:29, 15:23] = 1 + + gt = np.ones([2, 33, 33]) * 1.5 + gt[:, 17:29, 15:23] = 2 + + y_true = { + 'gt': tf.convert_to_tensor(gt, dtype=tf.float32), + 'weight': tf.convert_to_tensor(weights, dtype=tf.float32) + } + y_pred = {'pred': tf.zeros([2, 33, 33])} + loss_layer = loss.TopKGeneralLoss( + loss.mean_absolute_error, + 'gt', + 'pred', + 'weight') + + expected_loss = tf.ones([2]) * 2 + loss_result = loss_layer(y_true, y_pred) + + np.testing.assert_almost_equal( + loss_result.numpy(), expected_loss.numpy(), decimal=5) + + def test_topk_ce_loss_ignore(self): + num_classes = 19 + ignore_label = 255 + loss_layer = loss.TopKCrossEntropyLoss( + gt_key='gt', + pred_key='pred', + weight_key='weight', + num_classes=num_classes, + ignore_label=ignore_label) + + gt_tensor = np.ones(shape=[2, 33, 33], dtype=np.int32) * ignore_label + gt_tensor[:, 17:29, 15:23] = 1 + logits = tf.random.uniform(shape=[2, 33, 33, num_classes]) + + y_true = { + 'gt': tf.convert_to_tensor(gt_tensor), + 'weight': tf.ones([2, 33, 33]) + } + y_pred = {'pred': logits} + + expected_result = tf.nn.softmax_cross_entropy_with_logits( + tf.one_hot(np.squeeze(gt_tensor[:, 17:29, 15:23]), num_classes), + logits[:, 17:29, 15:23, :]) + expected_result = tf.reduce_mean(expected_result, axis=[1, 2]) + + per_sample_loss = loss_layer(y_true, y_pred) + + np.testing.assert_almost_equal( + per_sample_loss.numpy(), expected_result.numpy(), decimal=5) + + def test_topk_ce_loss_global_weight(self): + num_classes = 19 + weight = 3.145 + loss_layer = loss.TopKCrossEntropyLoss( + gt_key='gt', + pred_key='pred', + weight_key='weight', + num_classes=num_classes, + ignore_label=255) + logits = tf.random.uniform(shape=[2, 33, 33, num_classes]) + + y_true = { + 'gt': tf.ones([2, 33, 33], tf.int32), + 'weight': tf.ones([2, 33, 33]) + } + y_pred = {'pred': logits} + + expected_result = tf.nn.softmax_cross_entropy_with_logits( + tf.one_hot(y_true['gt'], num_classes), logits) + expected_result = tf.reduce_mean(expected_result, axis=[1, 2]) + expected_result *= weight + + per_sample_loss = loss_layer(y_true, y_pred, weight) + + np.testing.assert_almost_equal( + per_sample_loss.numpy(), expected_result.numpy(), decimal=5) + + def test_topk_ce_loss_topk(self): + num_classes = 19 + top_k = 0.5 + loss_layer = loss.TopKCrossEntropyLoss( + gt_key='gt', + pred_key='pred', + weight_key='weight', + num_classes=num_classes, + top_k_percent_pixels=top_k, + ignore_label=255) + + logits = tf.random.uniform(shape=[2, 33, 33, num_classes]) + y_true = { + 'gt': tf.ones([2, 33, 33], tf.int32), + 'weight': tf.ones([2, 33, 33]) + } + y_pred = {'pred': logits} + + expected_result = tf.nn.softmax_cross_entropy_with_logits( 
+        tf.one_hot(y_true['gt'], num_classes), logits)
+    expected_result, _ = tf.math.top_k(
+        tf.reshape(expected_result, shape=[2, -1]),
+        tf.cast((top_k * tf.size(y_true['gt'], tf.float32) / 2), tf.int32))
+    expected_result = tf.reduce_mean(expected_result, axis=[1])
+
+    per_sample_loss = loss_layer(y_true, y_pred)
+
+    np.testing.assert_almost_equal(
+        per_sample_loss.numpy(), expected_result.numpy(), decimal=5)
+
+  def test_is_one_hot(self):
+    num_classes = 19
+    gt_list = [
+        tf.ones([2, 33, 33], tf.int32),
+        tf.ones([2, 33], tf.int32),
+        tf.one_hot(tf.ones([2, 33, 33], tf.int32), num_classes),
+        tf.one_hot(tf.ones([2, 33], tf.int32), num_classes),
+    ]
+    pred_list = [
+        tf.random.uniform(shape=[2, 33, 33, num_classes]),
+        tf.random.uniform(shape=[2, 33, num_classes]),
+        tf.random.uniform(shape=[2, 33, 33, num_classes]),
+        tf.random.uniform(shape=[2, 33, num_classes]),
+    ]
+    expected_result_list = [False, False, True, True]
+    output_list = []
+    for gt, pred in zip(gt_list, pred_list):
+      output_list.append(loss.is_one_hot(gt, pred))
+    np.testing.assert_equal(output_list, expected_result_list)
+
+  def test_focal_ce_loss_integer_or_one_hot(self):
+    num_classes = 19
+    gamma = 0.5
+    alpha = 0.75
+    loss_layer = loss.FocalCrossEntropyLoss(
+        gt_key='gt',
+        pred_key='pred',
+        weight_key='weight',
+        num_classes=num_classes,
+        focal_loss_alpha=alpha,
+        focal_loss_gamma=gamma,
+        ignore_label=255)
+
+    logits = tf.random.uniform(shape=[2, 33 * 33, num_classes])
+    gt = tf.ones([2, 33 * 33], tf.int32)
+    use_one_hot_encode_list = [False, True]
+    for use_one_hot_encode in use_one_hot_encode_list:
+      if use_one_hot_encode:
+        gt = tf.one_hot(gt, num_classes)
+      y_true = {'gt': gt}
+      y_pred = {'pred': logits,
+                'weight': tf.ones([2, 33 * 33])}
+      predictions = tf.nn.softmax(logits, axis=-1)
+      if use_one_hot_encode:
+        pt = tf.reduce_sum(predictions * gt, axis=-1)
+        expected_result = tf.nn.softmax_cross_entropy_with_logits(gt, logits)
+      else:
+        pt = tf.reduce_sum(predictions * tf.one_hot(gt, num_classes), axis=-1)
+        expected_result = tf.nn.softmax_cross_entropy_with_logits(
+            tf.one_hot(gt, num_classes), logits)
+      expected_result = tf.multiply(tf.pow(1.0 - pt, gamma), expected_result)
+      expected_result = tf.reshape(expected_result, shape=[2, -1])
+      # Since the labels never contain the background channel (the last
+      # channel) in this example, the focal loss is simply scaled by alpha.
+      expected_result = tf.reduce_mean(expected_result, axis=[1]) * alpha
+      per_sample_loss = loss_layer(y_true, y_pred)
+
+      np.testing.assert_almost_equal(
+          per_sample_loss.numpy(), expected_result.numpy(), decimal=5)
+
+  def test_mask_dice_loss(self):
+    gt = [
+        [
+            [1., 1., 1.],
+            [0., 0., 0.],
+            [0., 0., 0.],
+        ],
+        [
+            [0., 0., 0.],
+            [1., 1., 1.],
+            [1., 1., 1.],
+        ],
+    ]
+    gt = tf.constant(gt, dtype=tf.float32)
+    gt = tf.expand_dims(gt, -1)
+    gt = tf.transpose(gt, perm=[3, 1, 2, 0])
+
+    y_true = {'gt': gt}
+
+    pred = [
+        [
+            [1., 1., 0.],
+            [1., 1., 0.],
+            [1., 1., 0.],
+        ],
+        [
+            [0., 0., 1.],
+            [0., 0., 1.],
+            [0., 0., 1.],
+        ],
+    ]
+    # Multiply by 100 so that the softmax output is effectively binary
+    # (0 or 1 values).
+    pred = tf.constant(pred, dtype=tf.float32) * 100.
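+    # As with gt above, the expand_dims/transpose below rearrange pred into
+    # shape [batch=1, height=3, width=3, channels=2], one mask per channel.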
+    pred = tf.expand_dims(pred, -1)
+    pred = tf.transpose(pred, perm=[3, 1, 2, 0])
+    y_pred = {
+        'pred': pred,
+        'weight': tf.ones([1]) * 0.5
+    }
+
+    loss_layer = loss.MaskDiceLoss(
+        gt_key='gt',
+        pred_key='pred',
+        weight_key='weight',
+        prediction_activation='softmax')
+    dice_loss = loss_layer(y_true, y_pred)
+    loss_result = dice_loss.numpy()
+    # For each channel,
+    # dice coefficient = (2 * intersection(=2) + smooth(=1)) /
+    #                    (denominator(=9) + smooth(=1)) = 5 / 10,
+    # so the per-channel dice loss is 1 - 5/10 = 0.5.
+    # Channel-wise sum: 0.5 + 0.5 = 1.0.
+    # Weighted result: 1.0 * weight(=0.5) = 0.5.
+    expected_result = np.array([0.5])
+    np.testing.assert_almost_equal(loss_result, expected_result)
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/model/loss/loss_builder.py b/model/loss/loss_builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..89a1606c614e9aaa3666e09ad0e58c4ddf680cc8
--- /dev/null
+++ b/model/loss/loss_builder.py
@@ -0,0 +1,220 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This file contains loss builder classes used in the DeepLab model."""
+
+import collections
+from typing import Any, Dict, Text, Tuple, Optional
+
+import tensorflow as tf
+
+from deeplab2 import common
+from deeplab2 import config_pb2
+from deeplab2.model.loss import base_loss
+from deeplab2.model.loss import max_deeplab_loss
+
+
+def _create_loss_and_weight(
+    loss_options: config_pb2.LossOptions.SingleLossOptions, gt_key: Text,
+    pred_key: Text, weight_key: Text,
+    **kwargs: Any) -> Tuple[tf.keras.losses.Loss, float]:
+  """Creates a loss and its weight from loss options.
+
+  Args:
+    loss_options: Loss options as defined by
+      config_pb2.LossOptions.SingleLossOptions or None.
+    gt_key: A key to extract the ground-truth from a dictionary.
+    pred_key: A key to extract the prediction from a dictionary.
+    weight_key: A key to extract the per-pixel weights from a dictionary.
+    **kwargs: Additional parameters to initialize the loss.
+
+  Returns:
+    A tuple of an instance of tf.keras.losses.Loss and its corresponding
+    weight as a float.
+
+  Raises:
+    ValueError: An error occurs when the loss name is not a valid loss.
+  """
+  if loss_options is None:
+    return None, 0
+  if loss_options.name == 'softmax_cross_entropy':
+    return base_loss.TopKCrossEntropyLoss(
+        gt_key,
+        pred_key,
+        weight_key,
+        top_k_percent_pixels=loss_options.top_k_percent,
+        **kwargs), loss_options.weight
+  elif loss_options.name == 'l1':
+    return base_loss.TopKGeneralLoss(
+        base_loss.mean_absolute_error,
+        gt_key,
+        pred_key,
+        weight_key,
+        top_k_percent_pixels=loss_options.top_k_percent), loss_options.weight
+  elif loss_options.name == 'mse':
+    return base_loss.TopKGeneralLoss(
+        base_loss.mean_squared_error,
+        gt_key,
+        pred_key,
+        weight_key,
+        top_k_percent_pixels=loss_options.top_k_percent), loss_options.weight
+
+  raise ValueError('Loss %s is not a valid loss.'
+                   % loss_options.name)
+
+
+class DeepLabFamilyLoss(tf.keras.layers.Layer):
+  """This class builds and calls losses for the DeepLab model family."""
+
+  def __init__(
+      self,
+      loss_options: config_pb2.LossOptions,
+      num_classes: Optional[int],
+      ignore_label: Optional[int],
+      thing_class_ids: Tuple[int, ...]):
+    """Initializes the losses for the DeepLab model family.
+
+    Args:
+      loss_options: Loss options as defined by config_pb2.LossOptions.
+      num_classes: An integer specifying the number of classes in the dataset.
+      ignore_label: An optional integer specifying the ignore label or None.
+      thing_class_ids: A tuple of length [N] containing N thing indices.
+    """
+    super(DeepLabFamilyLoss, self).__init__(name='DeepLabFamilyLoss')
+
+    # Single-term losses are losses that have only one loss term and thus each
+    # loss function directly returns a single tensor as the loss value, as
+    # opposed to multi-term losses that involve multiple terms and return a
+    # dictionary of loss values.
+    self._single_term_loss_func_and_weight_dict = collections.OrderedDict()
+    self._extra_loss_names = [common.TOTAL_LOSS]
+
+    if loss_options.HasField(common.SEMANTIC_LOSS):
+      self._single_term_loss_func_and_weight_dict[
+          common.SEMANTIC_LOSS] = _create_loss_and_weight(
+              loss_options.semantic_loss,
+              common.GT_SEMANTIC_KEY,
+              common.PRED_SEMANTIC_LOGITS_KEY,
+              common.SEMANTIC_LOSS_WEIGHT_KEY,
+              num_classes=num_classes,
+              ignore_label=ignore_label)
+
+    if loss_options.HasField(common.CENTER_LOSS):
+      self._single_term_loss_func_and_weight_dict[
+          common.CENTER_LOSS] = _create_loss_and_weight(
+              loss_options.center_loss, common.GT_INSTANCE_CENTER_KEY,
+              common.PRED_CENTER_HEATMAP_KEY, common.CENTER_LOSS_WEIGHT_KEY)
+
+    if loss_options.HasField(common.REGRESSION_LOSS):
+      self._single_term_loss_func_and_weight_dict[
+          common.REGRESSION_LOSS] = _create_loss_and_weight(
+              loss_options.regression_loss, common.GT_INSTANCE_REGRESSION_KEY,
+              common.PRED_OFFSET_MAP_KEY, common.REGRESSION_LOSS_WEIGHT_KEY)
+
+    # Currently, only used for Motion-DeepLab.
+    if loss_options.HasField(common.MOTION_LOSS):
+      self._single_term_loss_func_and_weight_dict[
+          common.MOTION_LOSS] = _create_loss_and_weight(
+              loss_options.motion_loss, common.GT_FRAME_OFFSET_KEY,
+              common.PRED_FRAME_OFFSET_MAP_KEY,
+              common.FRAME_REGRESSION_LOSS_WEIGHT_KEY)
+
+    # Next-frame regression loss used in ViP-DeepLab.
+    if loss_options.HasField(common.NEXT_REGRESSION_LOSS):
+      self._single_term_loss_func_and_weight_dict[
+          common.NEXT_REGRESSION_LOSS] = _create_loss_and_weight(
+              loss_options.next_regression_loss,
+              common.GT_NEXT_INSTANCE_REGRESSION_KEY,
+              common.PRED_NEXT_OFFSET_MAP_KEY,
+              common.NEXT_REGRESSION_LOSS_WEIGHT_KEY)
+
+    # Multi-term losses that return dictionaries of loss terms.
+    self._multi_term_losses = []
+
+    # MaXDeepLabLoss optionally returns four loss terms in total:
+    # - common.PQ_STYLE_LOSS_CLASS_TERM
+    # - common.PQ_STYLE_LOSS_MASK_DICE_TERM
+    # - common.MASK_ID_CROSS_ENTROPY_LOSS
+    # - common.INSTANCE_DISCRIMINATION_LOSS
+    if any([loss_options.HasField('pq_style_loss'),
+            loss_options.HasField('mask_id_cross_entropy_loss'),
+            loss_options.HasField('instance_discrimination_loss')]):
+      self._multi_term_losses.append(max_deeplab_loss.MaXDeepLabLoss(
+          loss_options, ignore_label, thing_class_ids))
+
+    for multi_term_loss in self._multi_term_losses:
+      self._extra_loss_names += multi_term_loss.loss_terms
+
+  def get_loss_names(self):
+    # Keep track of all the keys that will be returned in self.call().
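+    # This covers every single-term loss, all terms of the multi-term losses,
+    # and common.TOTAL_LOSS.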
+    loss_names = list(self._single_term_loss_func_and_weight_dict.keys())
+    return loss_names + self._extra_loss_names
+
+  def call(self, y_true: Dict[Text, tf.Tensor],
+           y_pred: Dict[Text, tf.Tensor]) -> Dict[Text, tf.Tensor]:
+    """Performs the loss computations given ground-truth and predictions.
+
+    The loss is computed for each sample separately. Currently, smoothed
+    ground-truth labels are not supported.
+
+    Args:
+      y_true: A dictionary of tf.Tensor containing all ground-truth data to
+        compute the loss. Depending on the configuration, the dict has to
+        contain common.GT_SEMANTIC_KEY, and optionally
+        common.GT_INSTANCE_CENTER_KEY, common.GT_INSTANCE_REGRESSION_KEY, and
+        common.GT_FRAME_OFFSET_KEY.
+      y_pred: A dictionary of tf.Tensor containing all predictions to compute
+        the loss. Depending on the configuration, the dict has to contain
+        common.PRED_SEMANTIC_LOGITS_KEY, and optionally
+        common.PRED_CENTER_HEATMAP_KEY, common.PRED_OFFSET_MAP_KEY, and
+        common.PRED_FRAME_OFFSET_MAP_KEY.
+
+    Returns:
+      The loss as a dict of tf.Tensor, optionally containing the following:
+      - common.SEMANTIC_LOSS: [batch].
+      - common.CENTER_LOSS: [batch].
+      - common.REGRESSION_LOSS: [batch].
+      - common.MOTION_LOSS: [batch], the frame offset regression loss.
+      - common.NEXT_REGRESSION_LOSS: [batch], the next regression loss.
+
+    Raises:
+      AssertionError: If the keys of the resulting_dict do not match
+        self.get_loss_names().
+      AssertionError: If the keys of the resulting_dict overlap with the keys
+        of the loss_dict.
+    """
+    resulting_dict = collections.OrderedDict()
+
+    # Single-term losses.
+    for loss_name, func_and_weight in (
+        self._single_term_loss_func_and_weight_dict.items()):
+      loss_func, loss_weight = func_and_weight
+      loss_value = loss_func(y_true, y_pred)
+      resulting_dict[loss_name] = loss_value * loss_weight
+
+    # Multi-term losses return a dictionary of loss terms, so we handle them
+    # differently.
+    for multi_term_loss in self._multi_term_losses:
+      loss_dict = multi_term_loss((y_true, y_pred))
+      if not set(loss_dict).isdisjoint(resulting_dict):
+        raise AssertionError('The keys of the resulting_dict overlap with the '
+                             'keys of the loss_dict.')
+      resulting_dict.update(loss_dict)
+
+    # Also include the total loss in the resulting_dict.
+    total_loss = tf.math.accumulate_n(list(resulting_dict.values()))
+    resulting_dict[common.TOTAL_LOSS] = total_loss
+
+    if sorted(resulting_dict.keys()) != sorted(self.get_loss_names()):
+      raise AssertionError(
+          'The keys of the resulting_dict should match self.get_loss_names().')
+    return resulting_dict
diff --git a/model/loss/loss_builder_test.py b/model/loss/loss_builder_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0abcddd15bdbf6aa1434c8a5a5edf1bc91899c51
--- /dev/null
+++ b/model/loss/loss_builder_test.py
@@ -0,0 +1,224 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +"""Tests for loss_builder.py.""" + +import numpy as np +import tensorflow as tf + +from deeplab2 import common +from deeplab2 import trainer_pb2 +from deeplab2.model.loss import loss_builder as loss + + +class LossTest(tf.test.TestCase): + + def test_panoptic_deeplab_loss(self): + ignore_label = 255 + num_classes = 19 + semantic_loss_options = trainer_pb2.LossOptions.SingleLossOptions( + name='softmax_cross_entropy') + center_loss_options = trainer_pb2.LossOptions.SingleLossOptions(name='mse') + regression_loss_options = trainer_pb2.LossOptions.SingleLossOptions( + name='l1') + motion_loss_options = trainer_pb2.LossOptions.SingleLossOptions( + name='l1') + loss_options = trainer_pb2.LossOptions( + semantic_loss=semantic_loss_options, + center_loss=center_loss_options, + regression_loss=regression_loss_options, + motion_loss=motion_loss_options) + + loss_layer = loss.DeepLabFamilyLoss( + loss_options, + num_classes=num_classes, + ignore_label=ignore_label, + thing_class_ids=tuple(range(11, 19))) + + pred_dict = { + common.PRED_SEMANTIC_LOGITS_KEY: + tf.random.uniform(shape=[2, 33, 33, num_classes]), + common.PRED_CENTER_HEATMAP_KEY: + tf.zeros(shape=[2, 33, 33]), + common.PRED_OFFSET_MAP_KEY: + tf.zeros(shape=[2, 33, 33, 2]), + common.PRED_FRAME_OFFSET_MAP_KEY: + tf.zeros(shape=[2, 33, 33, 2]), + } + + with self.subTest('Test center loss.'): + gt_dict = { + common.GT_SEMANTIC_KEY: + tf.ones(shape=[2, 33, 33]) * ignore_label, + common.GT_INSTANCE_CENTER_KEY: + tf.ones(shape=[2, 33, 33]) * 2, + common.GT_INSTANCE_REGRESSION_KEY: + tf.zeros(shape=[2, 33, 33, 2]), + common.GT_FRAME_OFFSET_KEY: + tf.zeros(shape=[2, 33, 33, 2]), + common.SEMANTIC_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + common.CENTER_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + common.REGRESSION_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + common.FRAME_REGRESSION_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + } + # expected_result = square(2 - 0). 
+ expected_result = tf.ones(shape=[2]) * 4 + loss_result = loss_layer(gt_dict, pred_dict)[common.TOTAL_LOSS] + + np.testing.assert_equal(loss_result.numpy(), expected_result.numpy()) + + with self.subTest('Test regression loss.'): + gt_dict = { + common.GT_SEMANTIC_KEY: + tf.ones(shape=[2, 33, 33]) * ignore_label, + common.GT_INSTANCE_CENTER_KEY: + tf.zeros(shape=[2, 33, 33]), + common.GT_INSTANCE_REGRESSION_KEY: + tf.ones(shape=[2, 33, 33, 2]) * 2, + common.GT_FRAME_OFFSET_KEY: + tf.ones(shape=[2, 33, 33, 2]) * 2, + common.SEMANTIC_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + common.CENTER_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + common.REGRESSION_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + common.FRAME_REGRESSION_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + } + expected_result = tf.ones(shape=[2]) * 4 + loss_result = loss_layer(gt_dict, pred_dict)[common.TOTAL_LOSS] + + np.testing.assert_equal(loss_result.numpy(), expected_result.numpy()) + + with self.subTest('Test instances losses.'): + gt_dict = { + common.GT_SEMANTIC_KEY: + tf.ones(shape=[2, 33, 33]) * ignore_label, + common.GT_INSTANCE_CENTER_KEY: + tf.ones(shape=[2, 33, 33]) * 2, + common.GT_INSTANCE_REGRESSION_KEY: + tf.ones(shape=[2, 33, 33, 2]) * 2, + common.GT_FRAME_OFFSET_KEY: + tf.ones(shape=[2, 33, 33, 2]) * 2, + common.SEMANTIC_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + common.CENTER_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + common.REGRESSION_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + common.FRAME_REGRESSION_LOSS_WEIGHT_KEY: + tf.zeros(shape=[2, 33, 33]), + } + expected_result = tf.ones(shape=[2]) * 6 + loss_result = loss_layer(gt_dict, pred_dict)[common.TOTAL_LOSS] + + np.testing.assert_equal(loss_result.numpy(), expected_result.numpy()) + + with self.subTest('Test all losses.'): + gt_dict = { + common.GT_SEMANTIC_KEY: + tf.ones(shape=[2, 33, 33], dtype=tf.int32), + common.GT_INSTANCE_CENTER_KEY: + tf.ones(shape=[2, 33, 33]) * 2, + common.GT_INSTANCE_REGRESSION_KEY: + tf.ones(shape=[2, 33, 33, 2]) * 2, + common.GT_FRAME_OFFSET_KEY: + tf.ones(shape=[2, 33, 33, 2]) * 2, + common.SEMANTIC_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + common.CENTER_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + common.REGRESSION_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + common.FRAME_REGRESSION_LOSS_WEIGHT_KEY: + tf.ones(shape=[2, 33, 33]), + } + expected_result = tf.nn.softmax_cross_entropy_with_logits( + tf.one_hot(gt_dict[common.GT_SEMANTIC_KEY], num_classes), + pred_dict[common.PRED_SEMANTIC_LOGITS_KEY]) + expected_result = tf.reduce_mean(expected_result, axis=[1, 2]) + # Add center and regression loss. 
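+      # center (MSE): (2 - 0)^2 = 4; regression (L1): |2 - 0| = 2;
+      # motion (L1): |2 - 0| = 2; total extra = 8.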
+ expected_result += tf.ones(shape=[2]) * 8 + + loss_result = loss_layer(gt_dict, pred_dict)[common.TOTAL_LOSS] + + np.testing.assert_equal(loss_result.numpy(), expected_result.numpy()) + + def test_panoptic_deeplab_semantic_loss_only(self): + ignore_label = 255 + num_classes = 19 + semantic_loss_options = trainer_pb2.LossOptions.SingleLossOptions( + name='softmax_cross_entropy') + loss_options = trainer_pb2.LossOptions( + semantic_loss=semantic_loss_options) + + loss_layer = loss.DeepLabFamilyLoss( + loss_options, + num_classes=num_classes, + ignore_label=ignore_label, + thing_class_ids=tuple(range(11, 19))) + + pred_dict = { + common.PRED_SEMANTIC_LOGITS_KEY: + tf.random.uniform(shape=[2, 33, 33, num_classes]), + } + gt_dict = { + common.GT_SEMANTIC_KEY: tf.ones(shape=[2, 33, 33], dtype=tf.int32), + common.SEMANTIC_LOSS_WEIGHT_KEY: tf.ones(shape=[2, 33, 33]), + } + + expected_result = tf.nn.softmax_cross_entropy_with_logits( + tf.one_hot(gt_dict[common.GT_SEMANTIC_KEY], num_classes), + pred_dict[common.PRED_SEMANTIC_LOGITS_KEY]) + expected_result = tf.reduce_mean(expected_result, axis=[1, 2]) + + loss_dict = loss_layer(gt_dict, pred_dict) + self.assertIn(common.SEMANTIC_LOSS, loss_dict) + self.assertNotIn(common.CENTER_LOSS, loss_dict) + self.assertNotIn(common.REGRESSION_LOSS, loss_dict) + self.assertNotIn(common.MOTION_LOSS, loss_dict) + loss_result = loss_dict[common.SEMANTIC_LOSS] + + np.testing.assert_equal(loss_result.numpy(), expected_result.numpy()) + + def test_panoptic_deeplab_loss_error(self): + semantic_loss_options = trainer_pb2.LossOptions.SingleLossOptions( + name='softmax_cross_entropy') + center_loss_options = trainer_pb2.LossOptions.SingleLossOptions( + name='not_a_loss', weight=1.0) + regression_loss_options = trainer_pb2.LossOptions.SingleLossOptions( + name='l1', weight=1.0) + motion_loss_options = trainer_pb2.LossOptions.SingleLossOptions( + name='l1', weight=1.0) + loss_options = trainer_pb2.LossOptions( + semantic_loss=semantic_loss_options, + center_loss=center_loss_options, + regression_loss=regression_loss_options, + motion_loss=motion_loss_options) + + with self.assertRaises(ValueError): + _ = loss.DeepLabFamilyLoss(loss_options, + num_classes=19, + ignore_label=255, + thing_class_ids=tuple(range(11, 19))) + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/loss/matchers_ops.py b/model/loss/matchers_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..4273e94476d7d7859797c8dfc1ace1ffe100f892 --- /dev/null +++ b/model/loss/matchers_ops.py @@ -0,0 +1,495 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tensorflow implementation to solve the Linear Sum Assignment problem. + +The Linear Sum Assignment problem involves determining the minimum weight +matching for bipartite graphs. For example, this problem can be defined by +a 2D matrix C, where each element i,j determines the cost of matching worker i +with job j. 
The solution to the problem is a complete assignment of jobs to
+workers, such that no job is assigned to more than one worker and no worker is
+assigned more than one job, with minimum cost.
+
+This implementation is designed to be used with tf.compat.v2 to be compatible
+with the rest of the DeepLab2 library. It builds off of the Hungarian Matching
+Algorithm (https://www.cse.ust.hk/~golin/COMP572/Notes/Matching.pdf), the
+original Lingvo tensorflow implementation by Jiquan Ngiam, and the modified TF1
+version by Amil Merchant.
+"""
+
+import tensorflow as tf
+
+
+def _prepare(weights):
+  """Prepares the cost matrix.
+
+  To improve the computational efficiency of the algorithm, all weights are
+  shifted to be non-negative. Each element is reduced by the row / column
+  minimum. Note that neither operation will affect the resulting solution but
+  will provide a better starting point for the greedy assignment. Note this
+  corresponds to the pre-processing and step 1 of the Hungarian algorithm from
+  Wikipedia.
+
+  Args:
+    weights: A float32 [batch_size, num_elems, num_elems] tensor, where each
+      inner matrix represents weights to be used for matching.
+
+  Returns:
+    A prepared weights tensor of the same shape and dtype.
+  """
+  # Since every worker needs a job and every job needs a worker, we can
+  # subtract the minimum from each.
+  weights -= tf.reduce_min(weights, axis=2, keepdims=True)
+  weights -= tf.reduce_min(weights, axis=1, keepdims=True)
+  return weights
+
+
+def _greedy_assignment(adj_matrix):
+  """Greedily assigns workers to jobs based on an adjacency matrix.
+
+  Starting with an adjacency matrix representing the available connections
+  in the bipartite graph, this function greedily chooses elements such
+  that each worker is matched to at most one job (or each job is assigned to
+  at most one worker). Note, if the adjacency matrix has no available values
+  for a particular row/column, the corresponding job/worker may go unassigned.
+
+  Args:
+    adj_matrix: A bool [batch_size, num_elems, num_elems] tensor, where each
+      element of the inner matrix represents whether the worker (row) can be
+      matched to the job (column).
+
+  Returns:
+    A bool [batch_size, num_elems, num_elems] tensor, where each element of the
+    inner matrix represents whether the worker has been matched to the job.
+    Each row and column can have at most one true element. Some of the rows
+    and columns may not be matched.
+  """
+  _, num_elems, _ = get_shape_list(adj_matrix, expected_rank=3)
+  adj_matrix = tf.transpose(adj_matrix, [1, 0, 2])
+
+  # Create a dynamic TensorArray containing the assignments for each
+  # worker/job.
+  assignment = tf.TensorArray(tf.bool, num_elems)
+
+  # Store the elements assigned to each column to update each iteration.
+  col_assigned = tf.zeros_like(adj_matrix[0, ...], dtype=tf.bool)
+
+  # Iteratively assign each row using tf.foldl. Intuitively, this is a loop
+  # over rows, where we incrementally assign each row.
+  def _assign_row(accumulator, row_adj):
+    # The accumulator tracks the row assignment index.
+    idx, assignment, col_assigned = accumulator
+
+    # Viable candidates cannot already be assigned to another job.
+    candidates = row_adj & (~col_assigned)
+
+    # Deterministically assign each row to its first viable candidate; ties
+    # are broken toward the lowest column index by tf.argmax.
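+    # For example, if candidates = [[False, True, True]], tf.argmax over the
+    # int32 cast returns index 1, matching the row to the first open column.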
+    max_candidate_idx = tf.argmax(
+        tf.cast(candidates, tf.int32), axis=1, output_type=tf.int32)
+
+    candidates_indicator = tf.one_hot(
+        max_candidate_idx,
+        num_elems,
+        on_value=True,
+        off_value=False,
+        dtype=tf.bool)
+    candidates_indicator &= candidates
+
+    # Make assignment to the column.
+    col_assigned |= candidates_indicator
+    assignment = assignment.write(idx, candidates_indicator)
+
+    return idx + 1, assignment, col_assigned
+
+  _, assignment, _ = tf.foldl(
+      _assign_row, adj_matrix, (0, assignment, col_assigned), back_prop=False)
+
+  assignment = assignment.stack()
+  assignment = tf.transpose(assignment, [1, 0, 2])
+  return assignment
+
+
+def _find_augmenting_path(assignment, adj_matrix):
+  """Finds an augmenting path given an assignment and an adjacency matrix.
+
+  The augmenting path search starts from the unassigned workers, then goes on
+  to find jobs (via an unassigned pairing), then back again to workers (via an
+  existing pairing), and so on. The path alternates between unassigned and
+  existing pairings. Returns the state after the search.
+
+  Note: In the state, the worker and job indices are 1-indexed so that we can
+  use 0 to represent unreachable nodes. State contains the following keys:
+
+  - jobs: A [batch_size, 1, num_elems] tensor containing the highest index
+    unassigned worker that can reach this job through a path.
+  - jobs_from_worker: A [batch_size, num_elems] tensor containing the worker
+    reached immediately before this job.
+  - workers: A [batch_size, num_elems, 1] tensor containing the highest index
+    unassigned worker that can reach this worker through a path.
+  - workers_from_job: A [batch_size, num_elems] tensor containing the job
+    reached immediately before this worker.
+  - new_jobs: A bool [batch_size, num_elems] tensor containing True if the
+    unassigned job can be reached via a path.
+
+  State can be used to recover the path via backtracking.
+
+  Args:
+    assignment: A bool [batch_size, num_elems, num_elems] tensor, where each
+      element of the inner matrix represents whether the worker has been
+      matched to the job. This may be a partial assignment.
+    adj_matrix: A bool [batch_size, num_elems, num_elems] tensor, where each
+      element of the inner matrix represents whether the worker (row) can be
+      matched to the job (column).
+
+  Returns:
+    A state dict, which represents the outcome of running an augmenting
+    path search on the graph given the assignment.
+  """
+  batch_size, num_elems, _ = get_shape_list(assignment, expected_rank=3)
+  unassigned_workers = ~tf.reduce_any(assignment, axis=2, keepdims=True)
+  unassigned_jobs = ~tf.reduce_any(assignment, axis=1, keepdims=True)
+
+  unassigned_pairings = tf.cast(adj_matrix & ~assignment, tf.int32)
+  existing_pairings = tf.cast(assignment, tf.int32)
+
+  # Initialize unassigned workers to have non-zero ids; assigned workers will
+  # have ids = 0.
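+  # For example, with num_elems = 3 and only worker 1 (0-indexed) unassigned,
+  # init_workers for that sample is [[0], [2], [0]].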
+  worker_indices = tf.range(1, num_elems + 1, dtype=tf.int32)
+  init_workers = tf.tile(worker_indices[tf.newaxis, :, tf.newaxis],
+                         [batch_size, 1, 1])
+  init_workers *= tf.cast(unassigned_workers, tf.int32)
+
+  state = {
+      "jobs": tf.zeros((batch_size, 1, num_elems), dtype=tf.int32),
+      "jobs_from_worker": tf.zeros((batch_size, num_elems), dtype=tf.int32),
+      "workers": init_workers,
+      "workers_from_job": tf.zeros((batch_size, num_elems), dtype=tf.int32)
+  }
+
+  def _has_active_workers(state, curr_workers):
+    """Checks if there are still active workers."""
+    del state
+    return tf.reduce_sum(curr_workers) > 0
+
+  def _augment_step(state, curr_workers):
+    """Performs one search step."""
+
+    # Note: These steps could be potentially much faster if sparse matrices are
+    # supported. The unassigned_pairings and existing_pairings matrices can be
+    # very sparse.
+
+    # Find potential jobs using current workers.
+    potential_jobs = curr_workers * unassigned_pairings
+    curr_jobs = tf.reduce_max(potential_jobs, axis=1, keepdims=True)
+    curr_jobs_from_worker = 1 + tf.argmax(
+        potential_jobs, axis=1, output_type=tf.int32)
+
+    # Remove already accessible jobs from curr_jobs.
+    default_jobs = tf.zeros_like(state["jobs"], dtype=state["jobs"].dtype)
+    curr_jobs = tf.where(state["jobs"] > 0, default_jobs, curr_jobs)
+    curr_jobs_from_worker *= tf.cast(curr_jobs > 0, tf.int32)[:, 0, :]
+
+    # Find potential workers from current jobs.
+    potential_workers = curr_jobs * existing_pairings
+    curr_workers = tf.reduce_max(potential_workers, axis=2, keepdims=True)
+    curr_workers_from_job = 1 + tf.argmax(
+        potential_workers, axis=2, output_type=tf.int32)
+
+    # Remove already accessible workers from curr_workers.
+    default_workers = tf.zeros_like(state["workers"])
+    curr_workers = tf.where(
+        state["workers"] > 0, default_workers, curr_workers)
+    curr_workers_from_job *= tf.cast(curr_workers > 0, tf.int32)[:, :, 0]
+
+    # Update state so that we can backtrack later.
+    state = state.copy()
+    state["jobs"] = tf.maximum(state["jobs"], curr_jobs)
+    state["jobs_from_worker"] = tf.maximum(state["jobs_from_worker"],
+                                           curr_jobs_from_worker)
+    state["workers"] = tf.maximum(state["workers"], curr_workers)
+    state["workers_from_job"] = tf.maximum(state["workers_from_job"],
+                                           curr_workers_from_job)
+
+    return state, curr_workers
+
+  with tf.name_scope("find_augmenting_path"):
+    state, _ = tf.while_loop(
+        _has_active_workers,
+        _augment_step, (state, init_workers),
+        back_prop=False)
+
+  # Compute new jobs; this is useful for determining termination of the
+  # maximum bipartite matching and initialization for backtracking.
+  new_jobs = (state["jobs"] > 0) & unassigned_jobs
+  state["new_jobs"] = new_jobs[:, 0, :]
+  return state
+
+
+def _improve_assignment(assignment, state):
+  """Improves an assignment by backtracking the augmented path using state.
+
+  Args:
+    assignment: A bool [batch_size, num_elems, num_elems] tensor, where each
+      element of the inner matrix represents whether the worker has been
+      matched to the job. This may be a partial assignment.
+    state: A dict, which represents the outcome of running an augmenting path
+      search on the graph given the assignment.
+
+  Returns:
+    A new assignment matrix of the same shape and type as assignment, where the
+    assignment has been updated using the augmented path found.
+  """
+  batch_size, num_elems, _ = get_shape_list(assignment, 3)
+
+  # We store the current job id and iteratively backtrack using
+  # jobs_from_worker and workers_from_job until we reach an unassigned worker.
+  # We flip all the assignments on this path to discover a better overall
+  # assignment.
+
+  # Note: The indices in state are 1-indexed, where 0 represents that the
+  # worker / job cannot be reached.
+
+  # Obtain initial job indices based on new_jobs.
+  curr_job_idx = tf.argmax(
+      tf.cast(state["new_jobs"], tf.int32), axis=1, output_type=tf.int32)
+
+  # Track whether an example is actively being backtracked. Since we are
+  # operating on a batch, not all examples in the batch may be active.
+  active = tf.gather(state["new_jobs"], curr_job_idx, batch_dims=1)
+  batch_range = tf.range(0, batch_size, dtype=tf.int32)
+
+  # Flip matrix tracks which assignments we need to flip - corresponding to the
+  # augmenting path taken. We use an integer tensor here so that we can use
+  # tensor_scatter_nd_add to update the tensor, and then cast it back to bool
+  # after the loop.
+  flip_matrix = tf.zeros((batch_size, num_elems, num_elems), dtype=tf.int32)
+
+  def _has_active_backtracks(flip_matrix, active, curr_job_idx):
+    """Checks if there are still examples being actively backtracked."""
+    del flip_matrix, curr_job_idx
+    return tf.reduce_any(active)
+
+  def _backtrack_one_step(flip_matrix, active, curr_job_idx):
+    """Takes one step in backtracking."""
+    # Discover the worker that the job originated from, note that this worker
+    # must exist by construction.
+    curr_worker_idx = tf.gather(
+        state["jobs_from_worker"], curr_job_idx, batch_dims=1) - 1
+    curr_worker_idx = tf.maximum(curr_worker_idx, 0)
+    update_indices = tf.stack([batch_range, curr_worker_idx, curr_job_idx],
+                              axis=1)
+    update_indices = tf.maximum(update_indices, 0)
+    flip_matrix = tf.tensor_scatter_nd_add(flip_matrix, update_indices,
+                                           tf.cast(active, tf.int32))
+
+    # Discover the (potential) job that the worker originated from.
+    curr_job_idx = tf.gather(
+        state["workers_from_job"], curr_worker_idx, batch_dims=1) - 1
+    # Note that jobs may not be active, and we track that here (before
+    # adjusting indices so that they are all >= 0 for gather).
+    active &= curr_job_idx >= 0
+    curr_job_idx = tf.maximum(curr_job_idx, 0)
+    update_indices = tf.stack([batch_range, curr_worker_idx, curr_job_idx],
+                              axis=1)
+    update_indices = tf.maximum(update_indices, 0)
+    flip_matrix = tf.tensor_scatter_nd_add(flip_matrix, update_indices,
+                                           tf.cast(active, tf.int32))
+
+    return flip_matrix, active, curr_job_idx
+
+  with tf.name_scope("improve_assignment"):
+    flip_matrix, _, _ = tf.while_loop(
+        _has_active_backtracks,
+        _backtrack_one_step, (flip_matrix, active, curr_job_idx),
+        back_prop=False)
+
+  flip_matrix = tf.cast(flip_matrix, tf.bool)
+  assignment = tf.math.logical_xor(assignment, flip_matrix)
+
+  return assignment
+
+
+def _maximum_bipartite_matching(adj_matrix, assignment=None):
+  """Performs maximum bipartite matching using augmented paths.
+
+  Args:
+    adj_matrix: A bool [batch_size, num_elems, num_elems] tensor, where each
+      element of the inner matrix represents whether the worker (row) can be
+      matched to the job (column).
+    assignment: An optional bool [batch_size, num_elems, num_elems] tensor,
+      where each element of the inner matrix represents whether the worker has
+      been matched to the job. This may be a partial assignment. If specified,
+      this assignment will be used to seed the iterative algorithm.
+
+  Returns:
+    A state dict representing the final augmenting path state search, and
+    a maximum bipartite matching assignment tensor. Note that the state outcome
+    can be used to compute a minimum vertex cover for the bipartite graph.
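+    By König's theorem, the size of such a minimum vertex cover equals the
+    size of the maximum matching, which is what hungarian_matching below
+    relies on to detect when a perfect matching has been found.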
+ """ + + if assignment is None: + assignment = _greedy_assignment(adj_matrix) + + state = _find_augmenting_path(assignment, adj_matrix) + + def _has_new_jobs(state, assignment): + del assignment + return tf.reduce_any(state["new_jobs"]) + + def _improve_assignment_and_find_new_path(state, assignment): + assignment = _improve_assignment(assignment, state) + state = _find_augmenting_path(assignment, adj_matrix) + return state, assignment + + with tf.name_scope("maximum_bipartite_matching"): + state, assignment = tf.while_loop( + _has_new_jobs, + _improve_assignment_and_find_new_path, (state, assignment), + back_prop=False) + + return state, assignment + + +def _compute_cover(state, assignment): + """Computes a cover for the bipartite graph. + + We compute a cover using the construction provided at + https://en.wikipedia.org/wiki/K%C5%91nig%27s_theorem_(graph_theory)#Proof + which uses the outcome from the alternating path search. + + Args: + state: A state dict, which represents the outcome of running an augmenting + path search on the graph given the assignment. + assignment: An optional bool [batch_size, num_elems, num_elems] tensor, + where each element of the inner matrix represents whether the worker has + been matched to the job. This may be a partial assignment. If specified, + this assignment will be used to seed the iterative algorithm. + + Returns: + A tuple of (workers_cover, jobs_cover) corresponding to row and column + covers for the bipartite graph. workers_cover is a boolean tensor of shape + [batch_size, num_elems, 1] and jobs_cover is a boolean tensor of shape + [batch_size, 1, num_elems]. + """ + assigned_workers = tf.reduce_any(assignment, axis=2, keepdims=True) + assigned_jobs = tf.reduce_any(assignment, axis=1, keepdims=True) + + reachable_workers = state["workers"] > 0 + reachable_jobs = state["jobs"] > 0 + + workers_cover = assigned_workers & (~reachable_workers) + jobs_cover = assigned_jobs & reachable_jobs + + return workers_cover, jobs_cover + + +def _update_weights_using_cover(workers_cover, jobs_cover, weights): + """Updates weights for hungarian matching using a cover. + + We first find the minimum uncovered weight. Then, we subtract this from all + the uncovered weights, and add it to all the doubly covered weights. + + Args: + workers_cover: A boolean tensor of shape [batch_size, num_elems, 1]. + jobs_cover: A boolean tensor of shape [batch_size, 1, num_elems]. + weights: A float32 [batch_size, num_elems, num_elems] tensor, where each + inner matrix represents weights to be use for matching. + + Returns: + A new weight matrix with elements adjusted by the cover. + """ + max_value = tf.reduce_max(weights) + + covered = workers_cover | jobs_cover + double_covered = workers_cover & jobs_cover + + uncovered_weights = tf.where(covered, + tf.ones_like(weights) * max_value, weights) + min_weight = tf.reduce_min(uncovered_weights, axis=[-2, -1], keepdims=True) + + add_weight = tf.where(double_covered, + tf.ones_like(weights) * min_weight, + tf.zeros_like(weights)) + sub_weight = tf.where(covered, tf.zeros_like(weights), + tf.ones_like(weights) * min_weight) + + return weights + add_weight - sub_weight + + +def get_shape_list(tensor, expected_rank=None): + """Returns a list of the shape of tensor. + + Args: + tensor: A tf.Tensor object to find the shape of + expected_rank: An (optional) int with the expected rank of the inputted + tensor. + + Returns: + A list representing the shape of the tesnor. 
+
+  Raises:
+    ValueError: If the expected rank does not match the actual rank of the
+      input tensor.
+  """
+  actual_rank = tensor.shape.ndims
+
+  if expected_rank and actual_rank != expected_rank:
+    raise ValueError("The tensor has rank %d which is not equal to the "
+                     "expected rank %d" % (actual_rank, expected_rank))
+
+  shape = tensor.shape.as_list()
+  dynamic = tf.shape(tensor)
+  output = [dim if dim else dynamic[ind] for ind, dim in enumerate(shape)]
+  return output
+
+
+def hungarian_matching(weights):
+  """Computes the minimum linear sum assignment using the Hungarian algorithm.
+
+  Args:
+    weights: A float32 [batch_size, num_elems, num_elems] tensor, where each
+      inner matrix represents weights to be used for matching.
+
+  Returns:
+    A bool [batch_size, num_elems, num_elems] tensor, where each element of
+    the inner matrix represents whether the worker has been matched to the
+    job. The returned matching will always be a perfect match.
+  """
+  batch_size, num_elems, _ = get_shape_list(weights, 3)
+
+  weights = _prepare(weights)
+  adj_matrix = tf.equal(weights, 0.)
+  state, assignment = _maximum_bipartite_matching(adj_matrix)
+  workers_cover, jobs_cover = _compute_cover(state, assignment)
+
+  def _cover_incomplete(workers_cover, jobs_cover, *args):
+    del args
+    cover_sum = (
+        tf.reduce_sum(tf.cast(workers_cover, tf.int32)) +
+        tf.reduce_sum(tf.cast(jobs_cover, tf.int32)))
+    return tf.less(cover_sum, batch_size * num_elems)
+
+  def _update_weights_and_match(workers_cover, jobs_cover, weights, assignment):
+    weights = _update_weights_using_cover(workers_cover, jobs_cover, weights)
+    adj_matrix = tf.equal(weights, 0.)
+    state, assignment = _maximum_bipartite_matching(adj_matrix, assignment)
+    workers_cover, jobs_cover = _compute_cover(state, assignment)
+    return workers_cover, jobs_cover, weights, assignment
+
+  with tf.name_scope("hungarian_matching"):
+    workers_cover, jobs_cover, weights, assignment = tf.while_loop(
+        _cover_incomplete,
+        _update_weights_and_match,
+        (workers_cover, jobs_cover, weights, assignment),
+        back_prop=False)
+
+  return assignment
diff --git a/model/loss/matchers_ops_test.py b/model/loss/matchers_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e453a12329a9ac79b9f24399fa8f7e2e047e29c
--- /dev/null
+++ b/model/loss/matchers_ops_test.py
@@ -0,0 +1,136 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +"""Tests for matchers_ops.""" + +import numpy as np +from scipy import optimize +import tensorflow as tf + +from deeplab2.model.loss import matchers_ops + + +class MatchersOpsTest(tf.test.TestCase): + + def hungarian_matching_tpu(self, cost_matrix): + resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='') + tf.config.experimental_connect_to_cluster(resolver) + tf.tpu.experimental.initialize_tpu_system(resolver) + strategy = tf.distribute.TPUStrategy(resolver) + + @tf.function + def function(): + costs = tf.constant(cost_matrix, cost_matrix.dtype, cost_matrix.shape) + return matchers_ops.hungarian_matching(costs) + # Get the first replica output. + return strategy.run(function).values[0].numpy() + + def testLinearSumAssignment(self): + """Check a simple 2D test case of the Linear Sum Assignment problem. + + Ensures that the implementation of the matching algorithm is correct + and functional on TPUs. + """ + cost_matrix = np.array([[[4, 1, 3], [2, 0, 5], [3, 2, 2]]], + dtype=np.float32) + adjacency_output = self.hungarian_matching_tpu(cost_matrix) + + correct_output = np.array([ + [0, 1, 0], + [1, 0, 0], + [0, 0, 1], + ], dtype=bool) + self.assertAllEqual(adjacency_output[0], correct_output) + + def testBatchedLinearSumAssignment(self): + """Check a batched case of the Linear Sum Assignment Problem. + + Ensures that a correct solution is found for all inputted problems within + a batch. + """ + cost_matrix = np.array([ + [[4, 1, 3], [2, 0, 5], [3, 2, 2]], + [[1, 4, 3], [0, 2, 5], [2, 3, 2]], + [[1, 3, 4], [0, 5, 2], [2, 2, 3]], + ], + dtype=np.float32) + + adjacency_output = self.hungarian_matching_tpu(cost_matrix) + + # Hand solved correct output for the linear sum assignment problem + correct_output = np.array([ + [[0, 1, 0], [1, 0, 0], [0, 0, 1]], + [[1, 0, 0], [0, 1, 0], [0, 0, 1]], + [[1, 0, 0], [0, 0, 1], [0, 1, 0]], + ], + dtype=bool) + self.assertAllClose(adjacency_output, correct_output) + + def testMaximumBipartiteMatching(self): + """Check that the maximum bipartite match assigns the correct numbers.""" + adj_matrix = tf.cast([[ + [1, 0, 0, 0, 1], + [0, 1, 0, 1, 0], + [0, 0, 1, 0, 0], + [0, 1, 0, 0, 0], + [1, 0, 0, 0, 0], + ]], tf.bool) # pyformat: disable + _, assignment = matchers_ops._maximum_bipartite_matching(adj_matrix) + self.assertEqual(np.sum(assignment), 5) + + def testAssignmentMatchesScipy(self): + """Check that the Linear Sum Assignment matches the Scipy implementation.""" + batch_size, num_elems = 2, 25 + weights = tf.random.uniform((batch_size, num_elems, num_elems), + minval=0., + maxval=1.) + assignment = matchers_ops.hungarian_matching(weights) + actual_weights = weights.numpy() + actual_assignment = assignment.numpy() + + for idx in range(batch_size): + _, scipy_assignment = optimize.linear_sum_assignment(actual_weights[idx]) + hungarian_assignment = np.where(actual_assignment[idx])[1] + + self.assertAllEqual(hungarian_assignment, scipy_assignment) + + def testAssignmentRunsOnTPU(self): + """Check that a batch of assignments matches Scipy.""" + batch_size, num_elems = 4, 100 + cost_matrix = np.random.rand(batch_size, num_elems, num_elems) + + actual_assignment = self.hungarian_matching_tpu(cost_matrix) + + for idx in range(batch_size): + _, scipy_assignment = optimize.linear_sum_assignment(cost_matrix[idx]) + hungarian_assignment = np.where(actual_assignment[idx])[1] + self.assertAllEqual(hungarian_assignment, scipy_assignment) + + def testLargeBatch(self): + """Check large-batch performance of Hungarian matcher. 
+ + Useful for testing efficiency of the proposed solution and regression + testing. Current solution is thought to be quadratic in nature, yielding + significant slowdowns when the number of queries is increased. + """ + batch_size, num_elems = 64, 100 + cost_matrix = np.abs( + np.random.normal(size=(batch_size, num_elems, num_elems))) + + _ = self.hungarian_matching_tpu(cost_matrix) + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/loss/max_deeplab_loss.py b/model/loss/max_deeplab_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..67368c5a69b4c9b6e871fb4ccded7cb7a502a762 --- /dev/null +++ b/model/loss/max_deeplab_loss.py @@ -0,0 +1,721 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This file contains the loss functions for MaX-DeepLab models. + +Reference: + MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers", + CVPR 2021. https://arxiv.org/abs/2012.00759 + Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen. +""" +from typing import Text, Dict, Tuple, List + +import tensorflow as tf +from deeplab2 import common +from deeplab2 import config_pb2 +from deeplab2.model import utils +from deeplab2.model.loss import base_loss +from deeplab2.model.loss import matchers_ops + +# Positive and negative constants that are used to pad or mask hungarian +# matching weights. +_MATCHING_NEGATIVE_CONSTANT = -999.0 +_MATCHING_POSITIVE_CONSTANT = 999.0 +# A large negative constant applied before softmax. This will make the softmax +# ignore the masked logits. +_SOFTMAX_MASKING_CONSTANT = -99999.0 + +_GT_KEY = 'gt_key' +_PRED_KEY = 'pred_key' +_WEIGHT_KEY = 'weight_key' + + +def _generate_mask_slot_semantic_one_hot( + matched_mask_slot_indices: tf.Tensor, + mask_gt_semantic_map: tf.Tensor, + num_mask_slots: int, + thing_stuff_class_ids: List[int]): + """Generates the ground truth for transformer_class_logits. + + This function generates a pseudo ground truth that we will use to train the + transformer class head logits. The input tensors, matched_mask_slot_indices + and mask_gt_semantic_map, are obtained by (hungarian) matching the ground + truth masks with the predicted masks. Note that this function generates the + positive one hot encodings only, i.e., the void class is not included in the + output tensor but will be generated outside the function. + + Args: + matched_mask_slot_indices: An int32 tf.Tensor of shape [batch_size, + num_ground_truth_masks] that encodes the matched mask slot id for each + ground truth mask. + mask_gt_semantic_map: An int32 tf.Tensor of shape [batch_size, + num_ground_truth_masks] that encodes the semantic label for each ground + truth mask. A padded mask (or void, or no object) will have the label -1. + num_mask_slots: An integer, the number of mask slots for the MaX-DeepLab + model. + thing_stuff_class_ids: A list of integers of length [num_thing_classes + + num_stuff_classes] that encodes the class IDs for all thing and stuff + classes. 
It is a concatenation of the thing_class_ids list and the
+      stuff_class_ids list.
+
+  Returns:
+    mask_slot_semantic_one_hot: An output tf.Tensor with shape [batch_size,
+      num_mask_slots, num_thing_classes + num_stuff_classes].
+  """
+  semantic_map_shape = mask_gt_semantic_map.get_shape().as_list()
+  batch_size = semantic_map_shape[0]
+  num_ground_truth_masks = semantic_map_shape[-1]
+
+  # Concatenate the indices in each dimension of the ground truth one hot
+  # output.
+  batch_indices = tf.expand_dims(tf.range(batch_size), axis=-1)
+  batch_indices = tf.tile(batch_indices, [1, num_ground_truth_masks])
+  batch_indices = tf.reshape(batch_indices, [-1, 1])
+  matched_mask_slot_indices = tf.reshape(matched_mask_slot_indices, [-1, 1])
+  # We shift the semantic map by one so that void labels (-1) will be a valid
+  # index too. Otherwise, tf.scatter_nd raises an error if it runs on CPU.
+  semantic_indices = tf.reshape(mask_gt_semantic_map, [-1, 1]) + 1
+  indices = tf.concat([batch_indices,
+                       matched_mask_slot_indices,
+                       semantic_indices], axis=-1)
+
+  # Generate mask_slot_semantic_one_hot by scattering constant ones onto a
+  # constant zero tensor.
+  updates = tf.ones([batch_size * num_ground_truth_masks], dtype=tf.float32)
+  mask_slot_semantic_one_hot = tf.scatter_nd(
+      indices, updates,
+      shape=[batch_size, num_mask_slots, max(thing_stuff_class_ids) + 2])
+
+  # Gather the wanted classes in the desired (thing + stuff) order.
+  thing_stuff_tensor = tf.cast(thing_stuff_class_ids, tf.int32)
+  # We also shift the thing_stuff_tensor index by one in order to revert the
+  # semantic map shifting above.
+  mask_slot_semantic_one_hot = tf.gather(mask_slot_semantic_one_hot,
+                                         thing_stuff_tensor + 1, axis=2)
+  return mask_slot_semantic_one_hot
+
+
+def nonsquare_hungarian_matching(
+    weights: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
+  """Hungarian matching with arbitrary shape.
+
+  The matchers_ops.hungarian_matching supports only square weight matrices.
+  This function generalizes the hungarian matching to nonsquare cases by
+  padding the weights to a square matrix and running the square-version
+  matching. The property of hungarian matching ensures that the solutions are
+  equivalent for the padded square problem and the original nonsquare
+  problem.
+
+  Args:
+    weights: A [batch, shape1, shape2] float32 tf.Tensor.
+
+  Returns:
+    square_permutation: A [batch, max(shape1, shape2), max(shape1, shape2)]
+      float32 tf.Tensor that is the permutation matrix that achieves the
+      minimum total weight. Note that a permutation matrix contains only
+      values 0.0 and 1.0, with each row and each column summing to 1.0.
+    nonsquare_permutation: A [batch, shape1, shape2] float32 tf.Tensor. The
+      nonsquare part of the permutation matrix.
+  """
+  _, height, width = weights.get_shape().as_list()
+  max_height_width = max(height, width)
+  # Padding a constant on one axis does not affect matching results.
+  weights = tf.pad(weights,
+                   [[0, 0],  # Do not pad the batch dimension.
+                    [0, max_height_width - height],
+                    [0, max_height_width - width]],
+                   constant_values=_MATCHING_NEGATIVE_CONSTANT)
+  square_permutation = matchers_ops.hungarian_matching(weights)
+
+  square_permutation = tf.cast(square_permutation, tf.float32)
+  return square_permutation, square_permutation[:, :height, :width]
+
+
+def _mask_similarity(gt_mask: tf.Tensor, pred_mask: tf.Tensor,
+                     metric: str = 'dice') -> tf.Tensor:
+  """Computes mask similarity between gt_masks and pred_masks.
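+
+  For the default 'dice' metric, for example, this computes
+  intersection / ((|gt| + |pred|) / 2 + 1e-5) for every (ground truth,
+  prediction) mask pair, i.e., the dice coefficient
+  2 * |gt * pred| / (|gt| + |pred|) up to the stabilizing epsilon in the
+  denominator.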
+
+  Args:
+    gt_mask: A [batch, height * width, num_gt_masks] float32 tf.Tensor that
+      contains only values 0.0 and 1.0. Each 1.0 indicates that the pixel
+      belongs to the ground truth mask. Note that panoptic segmentation
+      enforces that ground truth masks do not overlap.
+    pred_mask: A [batch, height * width, num_pred_masks] float32 tf.Tensor
+      with positive values. For each batch_id and pixel_id, the
+      [num_pred_masks] vector encodes whether each pixel belongs to each
+      mask. The sum of each vector is less than or equal to one.
+    metric: A string, the mask similarity metric that we will compute.
+      Supports 'dice' (default), 'iou', 'intersection_over_ground_truth', and
+      'intersection_over_prediction'.
+
+  Returns:
+    mask_similarity: A float32 [batch, num_gt_masks, num_pred_masks]
+      tf.Tensor that contains the mask similarity between all ground truth
+      masks and all predicted masks.
+
+  Raises:
+    ValueError: If the mask similarity metric is not one of 'dice', 'iou',
+      'intersection_over_ground_truth', or 'intersection_over_prediction'.
+  """
+  denominator_epsilon = 1e-5
+  intersection = tf.einsum('bpi,bpj->bij', gt_mask, pred_mask)
+  if metric.lower() == 'dice':
+    denominator = (tf.expand_dims(tf.reduce_sum(gt_mask, axis=1), axis=2) +
+                   tf.reduce_sum(pred_mask, axis=1, keepdims=True)) / 2
+  elif metric.lower() == 'iou':
+    denominator = (tf.expand_dims(tf.reduce_sum(gt_mask, axis=1), axis=2) +
+                   tf.reduce_sum(pred_mask, axis=1, keepdims=True) -
+                   intersection)
+  elif metric.lower() == 'intersection_over_ground_truth':
+    denominator = tf.expand_dims(tf.reduce_sum(gt_mask, axis=1), axis=2)
+  elif metric.lower() == 'intersection_over_prediction':
+    denominator = tf.reduce_sum(pred_mask, axis=1, keepdims=True)
+  else:
+    raise ValueError('The mask similarity metric is not supported.')
+  return intersection / (denominator + denominator_epsilon)
+
+
+class MaXDeepLabLoss(tf.keras.layers.Layer):
+  """This class contains code for MaX-DeepLab losses."""
+
+  def __init__(self,
+               loss_options: config_pb2.LossOptions,
+               ignore_label: int,
+               thing_class_ids: Tuple[int],
+               focal_loss_alpha: float = 0.75,
+               instance_discrimination_temperature: float = 0.3):
+    """Initializes a MaX-DeepLab loss.
+
+    This class supports PQ-style loss, mask id cross entropy loss, and
+    instance discrimination loss, proposed in MaX-DeepLab. The PQ-style loss
+    can be further decomposed into a classification term and a mask dice
+    term.
+
+    Reference:
+      MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers",
+      CVPR 2021. https://arxiv.org/abs/2012.00759
+        Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen.
+
+    Args:
+      loss_options: Loss options as defined by config_pb2.LossOptions.
+      ignore_label: An integer specifying the ignore label.
+      thing_class_ids: A tuple of length [N] containing N thing indices.
+      focal_loss_alpha: An optional float specifying the coefficient that
+        weights between positive (matched) and negative (unmatched) masks in
+        focal loss. The positives are weighted by alpha, while the negatives
+        are weighted by (1. - alpha). Note that we do not use a focal loss
+        gamma here, i.e., the gamma is set to zero, which is equivalent to
+        the normal cross-entropy loss, except for the alpha weighting.
+        Defaults to 0.75.
+      instance_discrimination_temperature: An optional float specifying the
+        temperature for the instance discrimination loss.
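+
+    Example (a minimal construction sketch; the ignore label and thing class
+    IDs below are illustrative placeholders, not a recommended setting):
+
+      loss_options = config_pb2.LossOptions(
+          pq_style_loss=config_pb2.LossOptions.SingleLossOptions())
+      loss_layer = MaXDeepLabLoss(
+          loss_options, ignore_label=0, thing_class_ids=(1, 2, 3))
+      loss_dict = loss_layer((y_true, y_pred))  # Keyed by common.* constants.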
+ """ + super(MaXDeepLabLoss, self).__init__(name='MaXDeepLabLoss') + # The loss_terms will optionally include + # - common.PQ_STYLE_LOSS_CLASS_TERM + # - common.PQ_STYLE_LOSS_MASK_DICE_TERM + # - common.MASK_ID_CROSS_ENTROPY_LOSS + # - common.INSTANCE_DISCRIMINATION_LOSS + # These loss terms will be accessed by loss_builder.py and will be used to + # build loss metrics. + self.loss_terms = [] + + # The PQ-style loss includes two terms. + self._pq_style_loss_weight = 0.0 + if loss_options.HasField(common.PQ_STYLE_LOSS): + self._pq_style_loss_weight = loss_options.pq_style_loss.weight + self.loss_terms.append(common.PQ_STYLE_LOSS_CLASS_TERM) + self.loss_terms.append(common.PQ_STYLE_LOSS_MASK_DICE_TERM) + + # Mask-ID cross entropy loss. + self._mask_id_cross_entropy_loss_weight = 0.0 + if loss_options.HasField(common.MASK_ID_CROSS_ENTROPY_LOSS): + self._mask_id_cross_entropy_loss_weight = ( + loss_options.mask_id_cross_entropy_loss.weight) + self.loss_terms.append(common.MASK_ID_CROSS_ENTROPY_LOSS) + + # Instance discrimination loss. + self._instance_discrimination_loss_weight = 0.0 + if loss_options.HasField(common.INSTANCE_DISCRIMINATION_LOSS): + self._instance_discrimination_loss_weight = ( + loss_options.instance_discrimination_loss.weight) + self.loss_terms.append(common.INSTANCE_DISCRIMINATION_LOSS) + + self._ignore_label = ignore_label + self._thing_class_ids = list(thing_class_ids) + self._focal_loss_alpha = focal_loss_alpha + self._instance_discrimination_temperature = ( + instance_discrimination_temperature) + + # Build the base loss functions. + self._pq_style_loss_class_term = base_loss.FocalCrossEntropyLoss( + gt_key=_GT_KEY, pred_key=_PRED_KEY, weight_key=_WEIGHT_KEY, + # Num_classes and ignore_label are not necessary since the inputs will + # be one hot encoded already. + num_classes=None, ignore_label=None, + focal_loss_alpha=focal_loss_alpha, + focal_loss_gamma=0.0, background_channel_index=-1, + dynamic_weight=True) + self._pq_style_loss_mask_dice_term = base_loss.MaskDiceLoss( + gt_key=_GT_KEY, pred_key=_PRED_KEY, weight_key=_WEIGHT_KEY, + prediction_activation='softmax') + self._mask_id_cross_entropy_loss = base_loss.TopKCrossEntropyLoss( + gt_key=_GT_KEY, pred_key=_PRED_KEY, weight_key=_WEIGHT_KEY, + # Num_classes and ignore_label are not necessary since the inputs will + # be one hot encoded already. + num_classes=None, ignore_label=None, + top_k_percent_pixels=1.0, dynamic_weight=True) + self._instance_discrimination_loss = base_loss.TopKCrossEntropyLoss( + gt_key=_GT_KEY, pred_key=_PRED_KEY, weight_key=_WEIGHT_KEY, + # Num_classes and ignore_label are not necessary since the inputs will + # be one hot encoded already. + num_classes=None, ignore_label=None, + top_k_percent_pixels=1.0, dynamic_weight=True) + + def build(self, + input_shapes: Tuple[Dict[Text, tf.Tensor], Dict[Text, tf.Tensor]]): + """Extracts useful constants that depend on the input shapes.""" + y_true_shapes = input_shapes[0] + self._max_thing_id = int(y_true_shapes[common.GT_THING_ID_CLASS_KEY][-1]) + y_pred_shapes = input_shapes[1] + transformer_class_logits_shape = y_pred_shapes[ + common.PRED_TRANSFORMER_CLASS_LOGITS_KEY] + self._num_mask_slots = int(transformer_class_logits_shape[1]) + # The transformer_class_logits contain thing classes, stuff classes, and the + # void class, so num_thing_stuff_classes should be the total number of + # classes minus one. 
+    self._num_thing_stuff_classes = int(transformer_class_logits_shape[2]) - 1
+    # Since we implement the PQ-style loss with the class term plus the mask
+    # dice term (Equation 10 of the paper), we need to balance the two terms
+    # to have the same weight and normalizing constants. The focal loss alpha
+    # is a weight on the positive class term, so we apply it to the mask dice
+    # term too. The class loss is also normalized by the number of mask
+    # slots, so we do the same normalization for the mask dice term.
+    self._mask_dice_term_modifier = (
+        self._focal_loss_alpha / self._num_mask_slots)
+
+    self._stuff_class_ids = utils.get_stuff_class_ids(
+        self._num_thing_stuff_classes,
+        self._thing_class_ids,
+        self._ignore_label)
+    self._num_stuff_classes = len(self._stuff_class_ids)
+    self._thing_stuff_class_ids = self._thing_class_ids + self._stuff_class_ids
+    self._pixel_gt_num_mask_id = self._max_thing_id + self._num_stuff_classes
+
+  def _pre_process_ground_truth(
+      self, y_true: Dict[Text, tf.Tensor], output_height: int, output_width: int
+  ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor,
+             tf.Tensor]:
+    """Pre-processes the ground truth before we compute the losses.
+
+    This function generates tensors that do not depend on the prediction of
+    the model, but are useful to the calculation of the losses. The function
+    mainly downsamples the pixel space ground truth to the model output
+    resolution, and combines (or concatenates) the thing masks and the stuff
+    masks. The output has pixel_gt_num_mask_id = max_thing_id +
+    num_stuff_classes masks, which means the output masks contain both thing
+    masks and stuff masks.
+
+    Args:
+      y_true: A dict of tensors providing ground-truth information, containing
+        - common.GT_SEMANTIC_KEY: A [batch, height, width] int32 tf.Tensor,
+          the semantic label map.
+        - common.GT_THING_ID_MASK_KEY: A [batch, height, width] int32
+          tf.Tensor. It assigns each non-crowd thing instance a unique
+          mask-ID label, starting from 0. Unassigned pixels are set to -1.
+        - common.GT_THING_ID_CLASS_KEY: A [batch, max_thing_id] int32
+          tf.Tensor. It contains the semantic ID of each instance assigned to
+          thing_id_mask. The remaining (max_thing_id - num_things) elements
+          are set to -1.
+      output_height: An integer, the height of the model output.
+      output_width: An integer, the width of the model output.
+
+    Returns:
+      pixel_gt_thing_mask: A [batch, output_height * output_width] float32
+        tensor, with values 0.0 and 1.0 only, indicating whether a pixel
+        belongs to a 'thing' class.
+      pixel_gt_non_void_mask: A [batch, output_height * output_width] float32
+        tensor, with values 0.0 and 1.0 only, indicating if a pixel does not
+        belong to the void class.
+      pixel_gt_mask_id_one_hot: A [batch, output_height * output_width,
+        pixel_gt_num_mask_id] float32 tensor, with values 0.0 and 1.0 only,
+        indicating the mask id each pixel belongs to.
+      mask_gt_semantic_map: A [batch, pixel_gt_num_mask_id] int32 tensor, the
+        semantic class of each ground truth mask.
+      mask_gt_non_void_mask: A [batch, pixel_gt_num_mask_id] float32 tensor,
+        with values 0.0 and 1.0 only, indicating if the ground truth mask is
+        a valid mask, not a padded mask. The masks are padded because TPU
+        does not support dynamic shapes except in the batch axis. We pad all
+        ground truth thing masks to a large enough constant max_thing_id.
+        Similarly, stuff classes that are not present in the current image
+        will be set to void masks too.
+      mask_gt_semantic_one_hot: A [batch, pixel_gt_num_mask_id,
+        num_thing_stuff_classes] float32 tensor, with values 0.0 and 1.0
+        only, containing the one hot encodings of the ground truth mask
+        classes. The last dimension contains concatenated thing classes and
+        stuff classes, which is different from the dataset class IDs in
+        mask_gt_semantic_map.
+      mask_gt_area: A [batch, pixel_gt_num_mask_id] float32 tensor, the area
+        of each ground truth mask. Padded masks have an area of 0.0.
+    """
+    # The depth of one hot encoding should be the largest id plus one. For
+    # example, if we want to one-hot encode a class ID of 133 (the largest ID
+    # for the COCO dataset), we will need a one-hot encoding of length 134.
+    one_hot_depth = max(self._thing_stuff_class_ids) + 1
+    batch_size = y_true[common.GT_SEMANTIC_KEY].get_shape().as_list()[0]
+
+    # Compute pixel_gt_semantic_map (downsampling and reshaping to the 1D
+    # representation that will be mainly used in this loss function).
+    pixel_gt_semantic_map = utils.strided_downsample(
+        y_true[common.GT_SEMANTIC_KEY],
+        target_size=[output_height, output_width])
+    pixel_gt_semantic_map = tf.reshape(
+        pixel_gt_semantic_map,
+        [batch_size, output_height * output_width])
+
+    # Compute pixel_gt_non_void_mask.
+    pixel_gt_non_void_mask = tf.cast(
+        tf.not_equal(pixel_gt_semantic_map, self._ignore_label), tf.float32)
+    pixel_gt_non_void_mask = tf.ensure_shape(
+        pixel_gt_non_void_mask,
+        [batch_size, output_height * output_width])
+
+    # Compute pixel_gt_semantic_one_hot from pixel_gt_semantic_map in order
+    # to gather pixel_gt_stuff_id_one_hot from pixel_gt_semantic_one_hot.
+    pixel_gt_semantic_one_hot = tf.one_hot(pixel_gt_semantic_map, one_hot_depth)
+    # Convert the one hot encoding from the dataset id order to the (thing,
+    # stuff) order used in MaX-DeepLab.
+    pixel_gt_stuff_id_one_hot = tf.gather(pixel_gt_semantic_one_hot,
+                                          self._stuff_class_ids, axis=-1)
+    pixel_gt_stuff_id_one_hot = tf.ensure_shape(
+        pixel_gt_stuff_id_one_hot,
+        [batch_size, output_height * output_width, self._num_stuff_classes])
+
+    # Compute pixel_gt_thing_id_one_hot for thing masks.
+    pixel_gt_thing_id_map = utils.strided_downsample(
+        y_true[common.GT_THING_ID_MASK_KEY],
+        target_size=[output_height, output_width])
+    pixel_gt_thing_id_map = tf.reshape(
+        pixel_gt_thing_id_map, shape=[batch_size, output_height * output_width])
+    # Note that common.GT_THING_ID_MASK_KEY uses -1 for void masks, and IDs 0
+    # to (max_thing_id - 1) for the thing instances.
+    pixel_gt_thing_mask = tf.cast(
+        tf.not_equal(pixel_gt_thing_id_map, -1), tf.float32)
+    pixel_gt_thing_id_one_hot = tf.one_hot(pixel_gt_thing_id_map,
+                                           self._max_thing_id)
+    # Compute pixel_gt_mask_id_one_hot by concatenating thing masks with
+    # stuff masks.
+    pixel_gt_mask_id_one_hot = tf.concat([pixel_gt_thing_id_one_hot,
+                                          pixel_gt_stuff_id_one_hot], axis=-1)
+    pixel_gt_mask_id_one_hot = tf.ensure_shape(
+        pixel_gt_mask_id_one_hot,
+        [batch_size, output_height * output_width, self._pixel_gt_num_mask_id])
+
+    # Compute mask_gt_area by summing the one hot encodings spatially.
+    mask_gt_area = tf.expand_dims(
+        tf.reduce_sum(pixel_gt_mask_id_one_hot, axis=1), axis=-1)
+    # Generate a binary mask for ground truth masks, indicating whether each
+    # ground truth mask exists in the pixel space with a non-zero area. Note
+    # that a mask that exists in the original input resolution will be
+    # removed if its area is zero in the output resolution, due to
+    # downsampling.
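+    # For example, a thin object that is only one or two pixels wide at the
+    # input resolution can vanish entirely under strided downsampling; its
+    # mask_gt_area is then 0.0 and the mask is treated as void below.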
+ mask_gt_area_mask = tf.reshape(mask_gt_area > 0.5, + [batch_size, self._pixel_gt_num_mask_id]) + + # Compute mask_gt_semantic_map and mask_gt_semantic_one_hot. + thing_id_gt_semantic_map = tf.reshape( + tf.cast(y_true[common.GT_THING_ID_CLASS_KEY], tf.int32), + [batch_size, self._max_thing_id]) + # The stuff ground truth semantic map is just the stuff class IDs. + stuff_id_gt_semantic_map = tf.tile( + tf.reshape( + tf.cast(self._stuff_class_ids, tf.int32), + [1, self._num_stuff_classes]), [batch_size, 1]) + mask_gt_semantic_map = tf.concat( + [thing_id_gt_semantic_map, stuff_id_gt_semantic_map], axis=-1) + # Set masks with zero area to void (-1), which is consistent with the void + # label used in common.GT_THING_ID_CLASS_KEY but is different from the + # ignore_labels of the datasets. + mask_gt_semantic_map = ( + (mask_gt_semantic_map + 1) * tf.cast(mask_gt_area_mask, tf.int32) - 1) + # Void (-1) classes will automatically be ignored by tf.one_hot. + mask_gt_semantic_one_hot = tf.one_hot(mask_gt_semantic_map, one_hot_depth) + mask_gt_semantic_one_hot = tf.gather( + mask_gt_semantic_one_hot, self._thing_stuff_class_ids, axis=-1) + + # Compute mask_gt_non_void_mask. Again, a mask that exists in the original + # input resolution is set to void if its area is zero in the output + # resolution, due to downsampling. + mask_gt_non_void_mask = tf.cast(mask_gt_semantic_map > -1, tf.float32) + mask_gt_non_void_mask = tf.ensure_shape( + mask_gt_non_void_mask, [batch_size, self._pixel_gt_num_mask_id]) + + return (pixel_gt_thing_mask, pixel_gt_non_void_mask, + pixel_gt_mask_id_one_hot, mask_gt_semantic_map, + mask_gt_non_void_mask, mask_gt_semantic_one_hot, mask_gt_area) + + def call( + self, inputs: Tuple[Dict[Text, tf.Tensor], Dict[Text, tf.Tensor]] + ) -> Dict[Text, tf.Tensor]: + """Computes the MaX-DeepLab losses. + + Args: + inputs: A tuple of two dicts (y_true, y_pred): + - y_true: A dict of tensors providing ground-truth information, containing + - common.GT_SEMANTIC_KEY: A [batch, height, width] int32 tf.Tensor, the + semantic label map. + - common.GT_THING_ID_MASK_KEY: A [batch, height, width] int32 + tf.Tensor. It assigns each non-crowd thing instance a unique mask-ID + label, starting from 0. Unassigned pixels are set to -1. + - common.GT_THING_ID_CLASS_KEY: A [batch, max_thing_id] int32 + tf.Tensor. It contains semantic ID of each instance assigned to + thing_id_mask. The remaining (max_thing_id - num_things) elements are + set to -1. + - y_pred: A dict of tensors providing predictions. + - common.PRED_PIXEL_SPACE_NORMALIZED_FEATURE_KEY: A [batch_size, + output_height, output_width, channels] float32 tensor. + - common.PRED_PIXEL_SPACE_MASK_LOGITS_KEY: A [batch_size, + output_height, output_width, num_mask_slots] float32 tensor, the + logits that a pixel belongs to a mask slot. + - common.PRED_TRANSFORMER_CLASS_LOGITS_KEY: A [batch_size, + num_mask_slots, num_thing_stuff_classes + 1] float32 tensor, the + logits that a mask belongs to a semantic class (including thing, + stuff, and void) + + Returns: + The loss as a dict of tf.Tensor, optionally containing the following: + - common.PQ_STYLE_LOSS_CLASS_TERM: [batch]. + - common.PQ_STYLE_LOSS_MASK_DICE_TERM: [batch]. + - common.MASK_ID_CROSS_ENTROPY_LOSS: [batch]. + - common.INSTANCE_DISCRIMINATION_LOSS: [batch]. 
+ """ + y_true, y_pred = inputs + resulting_dict = {} + + pixel_feature = y_pred[common.PRED_PIXEL_SPACE_NORMALIZED_FEATURE_KEY] + batch_size, output_height, output_width, _ = ( + pixel_feature.get_shape().as_list()) + + # Pre-process the ground truth. + (pixel_gt_thing_mask, pixel_gt_non_void_mask, pixel_gt_mask_id_one_hot, + mask_gt_semantic_map, mask_gt_non_void_mask, mask_gt_semantic_one_hot, + mask_gt_area) = self._pre_process_ground_truth(y_true, + output_height, output_width) + pixel_gt_non_void_mask_expanded = tf.expand_dims( + pixel_gt_non_void_mask, axis=-1) + + # Compute mask_average_feature by averaging the feature of each mask. + pixel_feature = tf.reshape( + pixel_feature, [batch_size, output_height * output_width, -1]) + mask_average_feature = tf.einsum( + 'bpd,bpi->bid', + pixel_feature, + pixel_gt_mask_id_one_hot) / tf.maximum(mask_gt_area, 1.0) + # Normalize the mask feature as the pixel space output feature is usually + # normalized too. + mask_average_feature = tf.math.l2_normalize(mask_average_feature, axis=-1) + + # Compute instance_discrimination_similarity, scaled by a constant + # temperature. + instance_discrimination_similarity = tf.einsum( + 'bpd,bid->bpi', pixel_feature, mask_average_feature) + instance_discrimination_similarity /= ( + self._instance_discrimination_temperature) + mask_gt_non_void_mask_expanded_1 = tf.expand_dims( + mask_gt_non_void_mask, axis=1) + # Mask void masks by setting them to a large negative value, so that they + # will be ignored by the softmax in the loss. + instance_discrimination_similarity = ( + mask_gt_non_void_mask_expanded_1 * instance_discrimination_similarity + + (1.0 - mask_gt_non_void_mask_expanded_1) * _SOFTMAX_MASKING_CONSTANT) + + # Auxiliary instance_discrimination_loss. + if self._instance_discrimination_loss_weight > 0.0: + resulting_dict[common.INSTANCE_DISCRIMINATION_LOSS] = ( + self._instance_discrimination_loss( + {_GT_KEY: pixel_gt_mask_id_one_hot}, + {_PRED_KEY: instance_discrimination_similarity, + _WEIGHT_KEY: pixel_gt_thing_mask}) * + self._instance_discrimination_loss_weight) + + # Extract pixel_space_mask_logits and pixel_space_mask_probs. + pixel_space_mask_logits = y_pred[common.PRED_PIXEL_SPACE_MASK_LOGITS_KEY] + pixel_space_mask_logits = tf.reshape( + pixel_space_mask_logits, + [batch_size, output_height * output_width, self._num_mask_slots]) + pixel_space_mask_probs = tf.nn.softmax(pixel_space_mask_logits, axis=-1) + + # Compute the mask similarity between all ground truth masks and all + # predicted masks. + mask_similarity = _mask_similarity( + pixel_gt_mask_id_one_hot, + pixel_space_mask_probs * pixel_gt_non_void_mask_expanded, + metric='dice') + + # Compute the class similarity by multiplying the ground truth one hot + # encoding with the predicted probability distribution. This is done between + # all ground truth masks and all predicted masks. + transformer_class_logits = y_pred[common.PRED_TRANSFORMER_CLASS_LOGITS_KEY] + transformer_class_probs = tf.nn.softmax( + transformer_class_logits, axis=-1)[:, :, :-1] + class_similarity = tf.einsum( + 'bij,bkj->bik', mask_gt_semantic_one_hot, transformer_class_probs) + + # Compute hungarian matching weights. We take the negative here since the + # hungarian matching algorithm looks for the matching with the least total + # weight. 
+    hungarian_weights = -mask_similarity * class_similarity
+    mask_gt_non_void_mask_expanded_2 = tf.expand_dims(
+        mask_gt_non_void_mask, axis=2)
+
+    # Mask the void ground truth masks (in the rows) so that they do not
+    # affect the result of the hungarian matching.
+    if self._num_mask_slots >= self._pixel_gt_num_mask_id:
+      # If the number of mask slots (number of columns) is larger than the
+      # constant number of ground truth masks (number of rows), the
+      # nonsquare_hungarian_matching will pad the rows with
+      # _MATCHING_NEGATIVE_CONSTANT. In this case, we can fill in the void
+      # mask rows with _MATCHING_NEGATIVE_CONSTANT too, so that the void mask
+      # rows are ignored as well, according to the hungarian matching
+      # property.
+      hungarian_weights = (
+          hungarian_weights * mask_gt_non_void_mask_expanded_2 +
+          (1 - mask_gt_non_void_mask_expanded_2) * _MATCHING_NEGATIVE_CONSTANT)
+    else:
+      # If the number of mask slots (number of columns) is smaller than the
+      # constant number of ground truth masks (number of rows), the
+      # nonsquare_hungarian_matching will pad the columns with
+      # _MATCHING_NEGATIVE_CONSTANT. In this case, we should fill in the void
+      # mask rows with _MATCHING_POSITIVE_CONSTANT here, so that the void
+      # mask rows have a huge cost compared with the existing non-void mask
+      # rows; the predicted masks will then prefer matching with existing
+      # non-void masks rather than the padded void masks, according to the
+      # hungarian matching property.
+      hungarian_weights = (
+          hungarian_weights * mask_gt_non_void_mask_expanded_2 +
+          (1 - mask_gt_non_void_mask_expanded_2) * _MATCHING_POSITIVE_CONSTANT)
+
+    # Perform the hungarian matching algorithm.
+    full_permutation, nonsquare_permutation = (
+        nonsquare_hungarian_matching(hungarian_weights))
+
+    # Extract the permutation (matching) between all existing non-void ground
+    # truth masks and the matched predicted masks.
+    matched_permutation = (
+        nonsquare_permutation * mask_gt_non_void_mask_expanded_2)
+    # The matched mask dice scores for each mask slot. The scores will be
+    # used as a loss weight for the PQ-style loss class term after the
+    # stop_gradient.
+    matched_mask_dice = tf.reduce_max(
+        mask_similarity * matched_permutation, axis=-2)
+    matched_mask_dice = tf.stop_gradient(matched_mask_dice)
+
+    # The matched class probabilities for each ground truth mask. The
+    # probabilities will be used as a loss weight for the PQ-style loss mask
+    # dice term after the stop_gradient.
+    matched_class_prob = tf.reduce_max(
+        class_similarity * matched_permutation, axis=-1)
+    matched_class_prob = tf.stop_gradient(matched_class_prob)
+
+    # Extract the index of the matched mask slot for each ground truth mask.
+    matched_mask_slot_indices = tf.math.argmax(
+        nonsquare_permutation, axis=-1, output_type=tf.dtypes.int32)
+
+    full_num_mask_slots = full_permutation.get_shape().as_list()[-1]
+    # Pad the pixel_space_mask_logits so that it is compatible with the
+    # permutation matrix.
+    full_pixel_space_mask_logits = tf.pad(
+        pixel_space_mask_logits,
+        [[0, 0], [0, 0], [0, full_num_mask_slots - self._num_mask_slots]],
+        constant_values=_SOFTMAX_MASKING_CONSTANT)
+
+    # Permute the pixel space mask logits with the permutation matrix, which
+    # converts the mask slot indices to the ground truth indices.
+    permuted_full_pixel_space_mask_logits = tf.einsum(
+        'bpi,bji->bpj', full_pixel_space_mask_logits, full_permutation)
+
+    # Pad the class probabilities too.
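+    # (matched_class_prob has pixel_gt_num_mask_id entries per example, while
+    # the square permutation uses full_num_mask_slots, so we zero-pad the
+    # class probabilities to the padded size.)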
+    full_matched_class_prob = tf.pad(
+        matched_class_prob,
+        [[0, 0], [0, full_num_mask_slots - self._pixel_gt_num_mask_id]])
+    # We only compute the dice loss term on non-void ground truth masks.
+    mask_dice_term_loss_weight = tf.pad(
+        mask_gt_non_void_mask,
+        [[0, 0], [0, full_num_mask_slots - self._pixel_gt_num_mask_id]])
+    # Use the class probabilities as the loss weight for the mask dice term.
+    # In addition, we set a lower bound, 1e-5, for the mask dice term loss
+    # weight. Otherwise, if a loss weight is accidentally zero, the dice loss
+    # will treat it as void and use an incorrect denominator or normalizing
+    # constant for the loss.
+    mask_dice_term_loss_weight *= tf.maximum(full_matched_class_prob, 1e-5)
+
+    # Pad the one hot encoding too.
+    full_pixel_gt_mask_id_one_hot = tf.pad(
+        pixel_gt_mask_id_one_hot,
+        [[0, 0], [0, 0], [0, full_num_mask_slots - self._pixel_gt_num_mask_id]])
+
+    if self._pq_style_loss_weight > 0.0:
+      # Mask_dice_term_modifier balances the mask_dice_term and the
+      # class_term of the PQ-style loss to have the same weight and
+      # normalizing constant.
+      resulting_dict[common.PQ_STYLE_LOSS_MASK_DICE_TERM] = (
+          self._pq_style_loss_mask_dice_term(
+              {_GT_KEY: full_pixel_gt_mask_id_one_hot},
+              {_PRED_KEY: permuted_full_pixel_space_mask_logits,
+               _WEIGHT_KEY: mask_dice_term_loss_weight}) *
+          (self._pq_style_loss_weight * self._mask_dice_term_modifier))
+
+    # The Mask-ID cross entropy loss shares the same ground truth and logits
+    # as the dice loss term, but with different weights.
+    if self._mask_id_cross_entropy_loss_weight > 0.0:
+      resulting_dict[common.MASK_ID_CROSS_ENTROPY_LOSS] = (
+          self._mask_id_cross_entropy_loss(
+              {_GT_KEY: full_pixel_gt_mask_id_one_hot},
+              {_PRED_KEY: permuted_full_pixel_space_mask_logits,
+               _WEIGHT_KEY: pixel_gt_non_void_mask}) *
+          self._mask_id_cross_entropy_loss_weight)
+
+    # Generate a pseudo ground truth for transformer_class_logits.
+    mask_slot_semantic_one_hot = _generate_mask_slot_semantic_one_hot(
+        matched_mask_slot_indices, mask_gt_semantic_map,
+        self._num_mask_slots, self._thing_stuff_class_ids)
+
+    # Compute the positive mask and the negative mask.
+    mask_slot_positive_mask = tf.cast(tf.equal(tf.reduce_max(
+        mask_slot_semantic_one_hot, axis=-1), 1.0), tf.float32)
+    mask_slot_negative_mask = 1.0 - mask_slot_positive_mask
+
+    # Compute the overlap ratio between all predicted masks and the void
+    # region. This void ratio will be used as a weight for the negative class
+    # term.
+    mask_void_ratio = tf.stop_gradient(_mask_similarity(
+        1.0 - pixel_gt_non_void_mask_expanded,
+        pixel_space_mask_probs,
+        'intersection_over_prediction'))
+    mask_void_ratio = tf.squeeze(mask_void_ratio, axis=1)
+
+    # Use the matched mask dice scores as the weights for the positive class
+    # terms. For the negative class terms, we reduce the penalty for a mask
+    # slot class term if the mask prediction overlaps a lot with void regions.
+    transformer_class_loss_weight = (
+        mask_slot_positive_mask * tf.maximum(matched_mask_dice, 1e-5) +
+        mask_slot_negative_mask * tf.maximum(mask_void_ratio, 1e-5))
+
+    # Concatenate the void mask as the last channel, constructing the final
+    # ground truth one hot label with (thing + stuff + void) channels.
+    transformer_class_one_hot = tf.concat(
+        [mask_slot_semantic_one_hot,
+         tf.expand_dims(mask_slot_negative_mask, axis=-1)], axis=-1)
+
+    # Apply the PQ-style loss class term.
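+    # Positive (matched) mask slots are weighted by their matched mask dice
+    # scores, and negative slots by their overlap with the void region, as
+    # constructed in transformer_class_loss_weight above.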
+ if self._pq_style_loss_weight > 0.0: + resulting_dict[common.PQ_STYLE_LOSS_CLASS_TERM] = ( + self._pq_style_loss_class_term( + {_GT_KEY: transformer_class_one_hot}, + {_PRED_KEY: transformer_class_logits, + _WEIGHT_KEY: transformer_class_loss_weight}) * + self._pq_style_loss_weight) + + return resulting_dict diff --git a/model/loss/max_deeplab_loss_test.py b/model/loss/max_deeplab_loss_test.py new file mode 100644 index 0000000000000000000000000000000000000000..895bca5a2246a35eef90ae54273499f6684772cd --- /dev/null +++ b/model/loss/max_deeplab_loss_test.py @@ -0,0 +1,103 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for max_deeplab_loss.py.""" + +import tensorflow as tf + +from deeplab2 import common +from deeplab2 import trainer_pb2 +from deeplab2.data import dataset +from deeplab2.model.loss import max_deeplab_loss + + +class MaXDeepLabLossTest(tf.test.TestCase): + + def test_max_deeplab_loss(self): + # Build the loss layer. + dataset_info = dataset.COCO_PANOPTIC_INFORMATION + semantic_loss_options = trainer_pb2.LossOptions.SingleLossOptions( + name='softmax_cross_entropy') + pq_style_loss_options = trainer_pb2.LossOptions.SingleLossOptions() + mask_id_cross_entropy_loss_options = ( + trainer_pb2.LossOptions.SingleLossOptions()) + instance_discrimination_loss_options = ( + trainer_pb2.LossOptions.SingleLossOptions()) + loss_options_1 = trainer_pb2.LossOptions( + semantic_loss=semantic_loss_options, + pq_style_loss=pq_style_loss_options, + mask_id_cross_entropy_loss=mask_id_cross_entropy_loss_options, + instance_discrimination_loss=instance_discrimination_loss_options) + loss_layer_1 = max_deeplab_loss.MaXDeepLabLoss( + loss_options_1, + ignore_label=dataset_info.ignore_label, + thing_class_ids=dataset_info.class_has_instances_list) + loss_options_2 = trainer_pb2.LossOptions( + pq_style_loss=pq_style_loss_options) + loss_layer_2 = max_deeplab_loss.MaXDeepLabLoss( + loss_options_2, + ignore_label=dataset_info.ignore_label, + thing_class_ids=dataset_info.class_has_instances_list) + + # Build the inputs. + pred_dict = { + common.PRED_PIXEL_SPACE_NORMALIZED_FEATURE_KEY: + tf.random.uniform(shape=[2, 9, 9, 8]), + common.PRED_PIXEL_SPACE_MASK_LOGITS_KEY: + tf.random.uniform(shape=[2, 9, 9, 128]), + common.PRED_TRANSFORMER_CLASS_LOGITS_KEY: + tf.random.uniform(shape=[2, 128, 134]), + } + gt_dict = { + common.GT_SEMANTIC_KEY: tf.ones(shape=[2, 33, 33], dtype=tf.int32), + common.GT_THING_ID_MASK_KEY: tf.ones(shape=[2, 33, 33], + dtype=tf.int32), + common.GT_THING_ID_CLASS_KEY: tf.concat( + # An image with ten people (class_id = 1) and 118 void masks. 
+ [tf.ones(shape=[2, 10], dtype=tf.int32), + -tf.ones(shape=[2, 118], dtype=tf.int32)], axis=-1), + } + loss_dict_1 = loss_layer_1((gt_dict, pred_dict)) + + self.assertIn(common.PQ_STYLE_LOSS_CLASS_TERM, loss_dict_1) + self.assertIn(common.PQ_STYLE_LOSS_MASK_DICE_TERM, loss_dict_1) + self.assertIn(common.MASK_ID_CROSS_ENTROPY_LOSS, loss_dict_1) + self.assertIn(common.INSTANCE_DISCRIMINATION_LOSS, loss_dict_1) + self.assertNotIn(common.PQ_STYLE_LOSS, loss_dict_1) + + self.assertIn(common.PQ_STYLE_LOSS_CLASS_TERM, loss_layer_1.loss_terms) + self.assertIn(common.PQ_STYLE_LOSS_MASK_DICE_TERM, loss_layer_1.loss_terms) + self.assertIn(common.MASK_ID_CROSS_ENTROPY_LOSS, loss_layer_1.loss_terms) + self.assertIn(common.INSTANCE_DISCRIMINATION_LOSS, loss_layer_1.loss_terms) + self.assertNotIn(common.PQ_STYLE_LOSS, loss_layer_1.loss_terms) + + loss_dict_2 = loss_layer_2((gt_dict, pred_dict)) + + self.assertIn(common.PQ_STYLE_LOSS_CLASS_TERM, loss_dict_2) + self.assertIn(common.PQ_STYLE_LOSS_MASK_DICE_TERM, loss_dict_2) + self.assertNotIn(common.MASK_ID_CROSS_ENTROPY_LOSS, loss_dict_2) + self.assertNotIn(common.INSTANCE_DISCRIMINATION_LOSS, loss_dict_2) + self.assertNotIn(common.PQ_STYLE_LOSS, loss_dict_2) + + self.assertIn(common.PQ_STYLE_LOSS_CLASS_TERM, loss_layer_2.loss_terms) + self.assertIn(common.PQ_STYLE_LOSS_MASK_DICE_TERM, loss_layer_2.loss_terms) + self.assertNotIn(common.MASK_ID_CROSS_ENTROPY_LOSS, loss_layer_2.loss_terms) + self.assertNotIn(common.INSTANCE_DISCRIMINATION_LOSS, + loss_layer_2.loss_terms) + self.assertNotIn(common.PQ_STYLE_LOSS, loss_layer_2.loss_terms) + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/post_processor/__init__.py b/model/post_processor/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..35e4ce02ff422f3aa84ab644b88d65b13e0cbc03 --- /dev/null +++ b/model/post_processor/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/model/post_processor/max_deeplab.py b/model/post_processor/max_deeplab.py new file mode 100644 index 0000000000000000000000000000000000000000..ab809c2dc9cdfb4ca308bbdd051f08508726d78d --- /dev/null +++ b/model/post_processor/max_deeplab.py @@ -0,0 +1,464 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""This file contains functions to post-process MaX-DeepLab results.""" + +import functools +from typing import List, Tuple, Dict, Text + +import tensorflow as tf + +from deeplab2 import common +from deeplab2 import config_pb2 +from deeplab2.data import dataset +from deeplab2.model import utils + + +def _get_transformer_class_prediction( + transformer_class_probs: tf.Tensor, + transformer_class_confidence_threshold: float + ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: + """Computes the transformer class prediction and confidence score. + + Args: + transformer_class_probs: A tf.Tensor of shape [num_mask_slots, + num_thing_stuff_classes + 1]. It is a pixel level logit scores where the + num_mask_slots is the number of mask slots (for both thing classes and + stuff classes) in MaX-DeepLab. The last channel indicates a `void` class. + transformer_class_confidence_threshold: A float for thresholding the + confidence of the transformer_class_probs. The panoptic mask slots with + class confidence less than the threshold are filtered and not used for + panoptic prediction. Only masks whose confidence is larger than the + threshold are counted in num_detections. + + Returns: + A tuple of: + - the detected mask class prediction as float32 tf.Tensor of shape + [num_detections]. + - the detected mask indices as tf.Tensor of shape [num_detections]. + - the number of detections as tf.Tensor of shape [1]. + """ + transformer_class_pred = tf.cast( + tf.argmax(transformer_class_probs, axis=-1), tf.float32) + transformer_class_confidence = tf.reduce_max( + transformer_class_probs, axis=-1, keepdims=False) + # Filter mask IDs with class confidence less than the threshold. + thresholded_mask = tf.cast( + tf.greater_equal(transformer_class_confidence, + transformer_class_confidence_threshold), tf.float32) + transformer_class_confidence = (transformer_class_confidence + * thresholded_mask) + + detected_mask_indices = tf.where(tf.greater(thresholded_mask, 0.5))[:, 0] + detected_mask_class_pred = tf.gather( + transformer_class_pred, detected_mask_indices) + num_detections = tf.shape(detected_mask_indices)[0] + return detected_mask_class_pred, detected_mask_indices, num_detections + + +def _get_mask_id_and_semantic_maps( + thing_class_ids: List[int], + stuff_class_ids: List[int], + pixel_space_mask_logits: tf.Tensor, + transformer_class_probs: tf.Tensor, + image_shape: List[int], + pixel_confidence_threshold=0.4, + transformer_class_confidence_threshold=0.7, + pieces=1) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]: + """Computes the pixel-level mask ID map and semantic map per image. + + Args: + thing_class_ids: A List of integers of shape [num_thing_classes] containing + thing class indices. + stuff_class_ids: A List of integers of shape [num_thing_classes] containing + stuff class indices. + pixel_space_mask_logits: A tf.Tensor of shape [height, width, + num_mask_slots]. It is a pixel level logit scores where the + num_mask_slots is the number of mask slots (for both thing classes + and stuff classes) in MaX-DeepLab. + transformer_class_probs: A tf.Tensor of shape [num_mask_slots, + num_thing_stuff_classes + 1]. It is a pixel level logit scores where the + num_mask_slots is the number of mask slots (for both thing classes and + stuff classes) in MaX-DeepLab. The last channel indicates a `void` class. + image_shape: A list of integers specifying the [height, width] of input + image. 
+    pixel_confidence_threshold: A float indicating a threshold for the
+      pixel-level softmax probability confidence of the transformer mask
+      logits. Pixel locations with confidence below the threshold get value
+      `0` in the `confident_regions` output and represent `void` (ignore)
+      regions.
+    transformer_class_confidence_threshold: A float for thresholding the
+      confidence of the transformer_class_probs. The panoptic mask slots with
+      class confidence less than the threshold are filtered and not used for
+      panoptic prediction.
+    pieces: An integer indicating the number of pieces in the piece-wise
+      operation. When computing the panoptic prediction and confident
+      regions, the mask logits are divided width-wise into multiple pieces
+      and processed piece-wise due to the GPU memory limit. Then, the
+      piece-wise outputs are concatenated along the width into the original
+      mask shape. Defaults to 1.
+
+  Returns:
+    A tuple of:
+    - the mask ID prediction as tf.Tensor with shape [height, width].
+    - the semantic prediction as tf.Tensor with shape [height, width].
+    - the thing region mask as tf.Tensor with shape [height, width].
+    - the stuff region mask as tf.Tensor with shape [height, width].
+
+  Raises:
+    ValueError: When the input image's `width - 1` is not divisible by
+      `pieces`.
+  """
+  # The last channel indicates the `void` class and thus is not included.
+  transformer_class_probs = transformer_class_probs[..., :-1]
+  # Generate the mapping from mask IDs to the dataset's thing and stuff
+  # semantic IDs.
+  thing_stuff_class_ids = thing_class_ids + stuff_class_ids
+
+  detected_mask_class_pred, detected_mask_indices, num_detections = (
+      _get_transformer_class_prediction(transformer_class_probs,
+                                        transformer_class_confidence_threshold))
+  # If num_detections = 0, return empty result maps.
+  def _return_empty_mask_id_and_semantic_maps():
+    return (
+        tf.ones([image_shape[0], image_shape[1]], dtype=tf.int32),
+        tf.zeros([image_shape[0], image_shape[1]], dtype=tf.int32),
+        tf.zeros([image_shape[0], image_shape[1]], dtype=tf.float32),
+        tf.zeros([image_shape[0], image_shape[1]], dtype=tf.float32))
+
+  # If num_detections > 0:
+  def _generate_mask_id_and_semantic_maps():
+    output_mask_id_map = []
+    output_confident_region = []
+    logits_width = pixel_space_mask_logits.get_shape().as_list()[1]
+    output_width = image_shape[1]
+
+    if (output_width - 1) % pieces > 0:
+      raise ValueError('`output_width - 1` must be divisible by `pieces`.')
+    # We use an input width that is a multiple of the feature stride, plus
+    # one, so that left- and right-alignment are preserved.
+    piece_output_width = (output_width - 1) // pieces + 1
+
+    for piece_id in range(pieces):
+      piece_begin = (logits_width - 1) // pieces * piece_id
+      # We use an input width that is a multiple of the feature stride, plus
+      # one, so that left- and right-alignment are preserved.
+      piece_end = (logits_width - 1) // pieces * (piece_id + 1) + 1
+      piece_pixel_mask_logits = (
+          pixel_space_mask_logits[:, piece_begin:piece_end, :])
+      piece_pixel_mask_logits = tf.compat.v1.image.resize_bilinear(
+          tf.expand_dims(piece_pixel_mask_logits, 0),
+          (image_shape[0], piece_output_width),
+          align_corners=True)
+      piece_pixel_mask_logits = tf.squeeze(piece_pixel_mask_logits, axis=0)
+      piece_detected_pixel_mask_logits = tf.gather(
+          piece_pixel_mask_logits, detected_mask_indices, axis=-1)
+      # Filter the pixels which are assigned to a mask ID that does not
+      # survive.
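+      # A pixel is kept only if its highest mask logit is attained by one of
+      # the detected (class-confidence thresholded) slots; otherwise the
+      # pixel falls outside every surviving mask and is marked unconfident.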
+      piece_max_logits = tf.reduce_max(piece_pixel_mask_logits, axis=-1)
+      piece_detected_max_logits = tf.reduce_max(
+          piece_detected_pixel_mask_logits, axis=-1)
+      piece_detected_mask = tf.cast(tf.math.equal(
+          piece_max_logits, piece_detected_max_logits), tf.float32)
+      # Filter with pixel mask threshold.
+      piece_pixel_confidence_map = tf.reduce_max(
+          tf.nn.softmax(piece_detected_pixel_mask_logits, axis=-1), axis=-1)
+      piece_confident_region = tf.cast(
+          piece_pixel_confidence_map > pixel_confidence_threshold, tf.float32)
+      piece_confident_region = piece_confident_region * piece_detected_mask
+      piece_mask_id_map = tf.cast(
+          tf.argmax(piece_detected_pixel_mask_logits, axis=-1), tf.int32)
+      if piece_id == pieces - 1:
+        output_mask_id_map.append(piece_mask_id_map)
+        output_confident_region.append(piece_confident_region)
+      else:
+        output_mask_id_map.append(piece_mask_id_map[:, :-1])
+        output_confident_region.append(piece_confident_region[:, :-1])
+
+    mask_id_map = tf.concat(output_mask_id_map, axis=1)
+    confident_region = tf.concat(output_confident_region, axis=1)
+    mask_id_map_flat = tf.reshape(mask_id_map, [-1])
+    mask_id_semantic_map_flat = tf.gather(
+        detected_mask_class_pred, mask_id_map_flat)
+    mask_id_semantic_map = tf.reshape(
+        mask_id_semantic_map_flat, [image_shape[0], image_shape[1]])
+    # Generate thing and stuff masks (with 1/0 indicating presence/absence).
+    thing_mask = tf.cast(mask_id_semantic_map < len(thing_class_ids),
+                         tf.float32) * confident_region
+    stuff_mask = tf.cast(mask_id_semantic_map >= len(thing_class_ids),
+                         tf.float32) * confident_region
+    # Generate semantic_map.
+    semantic_map = tf.gather(
+        tf.convert_to_tensor(thing_stuff_class_ids),
+        tf.cast(tf.round(mask_id_semantic_map_flat), tf.int32))
+    semantic_map = tf.reshape(semantic_map, [image_shape[0], image_shape[1]])
+    # Add 1 because mask ID 0 is reserved for the unconfident region.
+    mask_id_map_plus_one = mask_id_map + 1
+    semantic_map = tf.cast(tf.round(semantic_map), tf.int32)
+    return (mask_id_map_plus_one, semantic_map, thing_mask, stuff_mask)
+
+  mask_id_map_plus_one, semantic_map, thing_mask, stuff_mask = tf.cond(
+      tf.cast(num_detections, tf.float32) < tf.cast(0.5, tf.float32),
+      _return_empty_mask_id_and_semantic_maps,
+      _generate_mask_id_and_semantic_maps)
+
+  return (mask_id_map_plus_one, semantic_map, thing_mask, stuff_mask)
+
+
+def _filter_by_count(input_index_map: tf.Tensor,
+                     area_limit: int) -> Tuple[tf.Tensor, tf.Tensor]:
+  """Filters input index map by area limit threshold per index.
+
+  Args:
+    input_index_map: A float32 tf.Tensor of shape [batch, height, width].
+    area_limit: An integer specifying the minimum number of pixels that each
+      index region needs to have; regions under the limit are masked (zeroed)
+      out.
+
+  Returns:
+    masked input_index_map: A tf.Tensor with shape [batch, height, width],
+      masked by the area_limit threshold.
+    mask: A tf.Tensor with shape [batch, height, width]. It is a pixel-level
+      mask with 1. indicating the regions over the area limit, and 0. otherwise.
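+
+  For example, with area_limit = 3, every index that covers fewer than three
+  pixels is zeroed out in both returned tensors; see test_filter_by_count in
+  max_deeplab_test.py for a worked case.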
+  """
+  batch_size = tf.shape(input_index_map)[0]
+  index_map = tf.cast(tf.round(input_index_map), tf.int32)
+  index_map_flat = tf.reshape(index_map, [batch_size, -1])
+  counts = tf.math.bincount(index_map_flat, axis=-1)
+  counts_map = tf.gather(counts, index_map_flat, batch_dims=1)
+  counts_map = tf.reshape(counts_map, tf.shape(index_map))
+
+  mask = tf.cast(
+      tf.cast(counts_map, tf.float32) > tf.cast(area_limit - 0.5, tf.float32),
+      input_index_map.dtype)
+  return input_index_map * mask, mask
+
+
+def _merge_mask_id_and_semantic_maps(
+    mask_id_maps_plus_one: tf.Tensor,
+    semantic_maps: tf.Tensor,
+    thing_masks: tf.Tensor,
+    stuff_masks: tf.Tensor,
+    void_label: int,
+    label_divisor: int,
+    thing_area_limit: int,
+    stuff_area_limit: int) -> tf.Tensor:
+  """Merges mask_id maps and semantic_maps to obtain panoptic segmentation.
+
+  Args:
+    mask_id_maps_plus_one: A tf.Tensor of shape [batch, height, width].
+    semantic_maps: A tf.Tensor of shape [batch, height, width].
+    thing_masks: A float32 tf.Tensor of shape [batch, height, width] containing
+      masks with 1. at thing regions, 0. otherwise.
+    stuff_masks: A float32 tf.Tensor of shape [batch, height, width] containing
+      masks with 1. at stuff regions, 0. otherwise.
+    void_label: An integer specifying the void label.
+    label_divisor: An integer specifying the label divisor of the dataset.
+    thing_area_limit: An integer specifying the number of pixels that thing
+      regions need to have at least. The thing region will be included in the
+      panoptic prediction, only if its area is larger than the limit; otherwise,
+      it will be re-assigned as void_label.
+    stuff_area_limit: An integer specifying the number of pixels that stuff
+      regions need to have at least. The stuff region will be included in the
+      panoptic prediction, only if its area is larger than the limit; otherwise,
+      it will be re-assigned as void_label.
+
+  Returns:
+    panoptic_maps: A tf.Tensor with shape [batch, height, width].
+  """
+  thing_mask_id_maps_plus_one = (tf.cast(mask_id_maps_plus_one, tf.float32)
+                                 * thing_masks)
+  # We increase semantic_maps by 1 before masking (zeroing) by thing_masks and
+  # stuff_masks, to ensure all valid semantic IDs are greater than 0 and thus
+  # not masked out.
+  semantic_maps_plus_one = semantic_maps + 1
+  tf.debugging.assert_less(
+      tf.reduce_sum(thing_masks * stuff_masks), 0.5,
+      message='thing_masks and stuff_masks must be mutually exclusive.')
+
+  thing_semantic_maps = (tf.cast(semantic_maps_plus_one, tf.float32)
+                         * thing_masks)
+  stuff_semantic_maps = (tf.cast(semantic_maps_plus_one, tf.float32)
+                         * stuff_masks)
+
+  # Filter stuff_semantic_maps by stuff_area_limit.
+  stuff_semantic_maps, _ = _filter_by_count(
+      stuff_semantic_maps, stuff_area_limit)
+  # Filter thing_mask_id_map and thing_semantic_map by thing_area_limit.
+  thing_mask_id_maps_plus_one, mask_id_count_filter_mask = _filter_by_count(
+      thing_mask_id_maps_plus_one, thing_area_limit)
+  thing_semantic_maps = thing_semantic_maps * mask_id_count_filter_mask
+
+  # Filtered, unconfident regions will be replaced with `void_label`. The
+  # "plus one" offset is then reverted: the unconfident region (0) becomes -1,
+  # so we add (void_label + 1) to map it back to void_label.
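+  # A purely illustrative example (hypothetical values): with void_label = 255
+  # and label_divisor = 256, a confident thing pixel of semantic ID 7 held in
+  # mask slot 3 (stored as 4 after the "plus one") receives panoptic label
+  # 7 * 256 + 4 = 1796, while a filtered pixel receives 255 * 256 + 0 = 65280.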
+  semantic_maps_new = thing_semantic_maps + stuff_semantic_maps - 1.0
+  semantic_maps_new = (tf.cast(semantic_maps_new < -0.5, tf.float32)
+                       * tf.cast(void_label + 1, tf.float32)
+                       + semantic_maps_new)
+  panoptic_maps = (semantic_maps_new * label_divisor
+                   + thing_mask_id_maps_plus_one)
+  panoptic_maps = tf.cast(tf.round(panoptic_maps), tf.int32)
+  return panoptic_maps
+
+
+def _get_panoptic_predictions(
+    pixel_space_mask_logits: tf.Tensor,
+    transformer_class_logits: tf.Tensor,
+    thing_class_ids: List[int],
+    void_label: int,
+    label_divisor: int,
+    thing_area_limit: int,
+    stuff_area_limit: int,
+    image_shape: List[int],
+    pixel_confidence_threshold=0.4,
+    transformer_class_confidence_threshold=0.7,
+    pieces=1) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
+  """Computes the pixel-level panoptic, mask ID, and semantic maps.
+
+  Args:
+    pixel_space_mask_logits: A tf.Tensor of shape [batch, strided_height,
+      strided_width, num_mask_slots]. It contains the pixel-level logit
+      scores, where num_mask_slots is the number of mask slots (for both
+      thing and stuff classes) in MaX-DeepLab.
+    transformer_class_logits: A tf.Tensor of shape [batch, num_mask_slots,
+      num_thing_stuff_classes + 1]. It contains the class logits of each mask
+      slot, where num_mask_slots is the number of mask slots (for both thing
+      and stuff classes) in MaX-DeepLab. The last channel indicates a `void`
+      class.
+    thing_class_ids: A List of integers of shape [num_thing_classes] containing
+      thing class indices.
+    void_label: An integer specifying the void label.
+    label_divisor: An integer specifying the label divisor of the dataset.
+    thing_area_limit: An integer specifying the number of pixels that thing
+      regions need to have at least. The thing region will be included in the
+      panoptic prediction, only if its area is larger than the limit; otherwise,
+      it will be re-assigned as void_label.
+    stuff_area_limit: An integer specifying the number of pixels that stuff
+      regions need to have at least. The stuff region will be included in the
+      panoptic prediction, only if its area is larger than the limit; otherwise,
+      it will be re-assigned as void_label.
+    image_shape: A list of integers specifying the [height, width] of input
+      image.
+    pixel_confidence_threshold: A float indicating a threshold for the pixel
+      level softmax probability confidence of transformer mask logits. If less
+      than the threshold, the pixel locations have confidence `0` in
+      `confident_regions` output, and represent `void` (ignore) regions.
+    transformer_class_confidence_threshold: A float for thresholding the
+      confidence of the transformer_class_probs. The panoptic mask slots with
+      class confidence less than the threshold are filtered and not used for
+      panoptic prediction.
+    pieces: An integer indicating the number of pieces in the piece-wise
+      operation in `_get_mask_id_and_semantic_maps`. When computing panoptic
+      prediction and confident regions, the mask logits are divided width-wise
+      into multiple pieces and processed piece-wise due to the GPU memory limit.
+      Then, the piece-wise outputs are concatenated along the width into the
+      original mask shape. Defaults to 1.
+
+  Returns:
+    A tuple of:
+    - the panoptic prediction as tf.Tensor with shape [batch, height, width].
+    - the mask ID prediction as tf.Tensor with shape [batch, height, width].
+    - the semantic prediction as tf.Tensor with shape [batch, height, width].
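+
+  For example, with label_divisor = 10 as in test_get_panoptic_predictions in
+  max_deeplab_test.py, panoptic label 23 encodes semantic ID 2 and mask slot
+  (instance) ID 3, since 23 = 2 * 10 + 3.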
+ """ + transformer_class_probs = tf.nn.softmax(transformer_class_logits, axis=-1) + batch_size = tf.shape(transformer_class_logits)[0] + # num_thing_stuff_classes does not include `void` class, so we decrease by 1. + num_thing_stuff_classes = ( + transformer_class_logits.get_shape().as_list()[-1] - 1) + # Generate thing and stuff class ids + stuff_class_ids = utils.get_stuff_class_ids( + num_thing_stuff_classes, thing_class_ids, void_label) + + mask_id_map_plus_one_lists = tf.TensorArray( + tf.int32, size=batch_size, dynamic_size=False) + semantic_map_lists = tf.TensorArray( + tf.int32, size=batch_size, dynamic_size=False) + thing_mask_lists = tf.TensorArray( + tf.float32, size=batch_size, dynamic_size=False) + stuff_mask_lists = tf.TensorArray( + tf.float32, size=batch_size, dynamic_size=False) + for i in tf.range(batch_size): + mask_id_map_plus_one, semantic_map, thing_mask, stuff_mask = ( + _get_mask_id_and_semantic_maps( + thing_class_ids, stuff_class_ids, + pixel_space_mask_logits[i, ...], transformer_class_probs[i, ...], + image_shape, pixel_confidence_threshold, + transformer_class_confidence_threshold, pieces) + ) + mask_id_map_plus_one_lists = mask_id_map_plus_one_lists.write( + i, mask_id_map_plus_one) + semantic_map_lists = semantic_map_lists.write(i, semantic_map) + thing_mask_lists = thing_mask_lists.write(i, thing_mask) + stuff_mask_lists = stuff_mask_lists.write(i, stuff_mask) + # This does not work with unknown shapes. + mask_id_maps_plus_one = mask_id_map_plus_one_lists.stack() + semantic_maps = semantic_map_lists.stack() + thing_masks = thing_mask_lists.stack() + stuff_masks = stuff_mask_lists.stack() + + panoptic_maps = _merge_mask_id_and_semantic_maps( + mask_id_maps_plus_one, semantic_maps, thing_masks, stuff_masks, + void_label, label_divisor, thing_area_limit, stuff_area_limit) + return panoptic_maps, mask_id_maps_plus_one, semantic_maps + + +class PostProcessor(tf.keras.layers.Layer): + """This class contains code of a MaX-DeepLab post-processor.""" + + def __init__( + self, + config: config_pb2.ExperimentOptions, + dataset_descriptor: dataset.DatasetDescriptor): + """Initializes a MaX-DeepLab post-processor. + + Args: + config: A config_pb2.ExperimentOptions configuration. + dataset_descriptor: A dataset.DatasetDescriptor. + """ + super(PostProcessor, self).__init__(name='PostProcessor') + self._post_processor = functools.partial( + _get_panoptic_predictions, + thing_class_ids=list(dataset_descriptor.class_has_instances_list), + void_label=dataset_descriptor.ignore_label, + label_divisor=dataset_descriptor.panoptic_label_divisor, + thing_area_limit=config.evaluator_options.thing_area_limit, + stuff_area_limit=config.evaluator_options.stuff_area_limit, + image_shape=list(config.eval_dataset_options.crop_size), + transformer_class_confidence_threshold=config.evaluator_options + .transformer_class_confidence_threshold, + pixel_confidence_threshold=config.evaluator_options + .pixel_confidence_threshold, + pieces=1) + + def call(self, result_dict: Dict[Text, tf.Tensor]) -> Dict[Text, tf.Tensor]: + """Performs the post-processing given model predicted results. + + Args: + result_dict: A dictionary of tf.Tensor containing model results. 
The dict + has to contain + - common.PRED_PIXEL_SPACE_MASK_LOGITS_KEY, + - common.PRED_TRANSFORMER_CLASS_LOGITS_KEY, + + Returns: + The post-processed dict of tf.Tensor, containing the following: + - common.PRED_SEMANTIC_KEY, + - common.PRED_INSTANCE_KEY, + - common.PRED_PANOPTIC_KEY, + """ + processed_dict = {} + (processed_dict[common.PRED_PANOPTIC_KEY], + processed_dict[common.PRED_INSTANCE_KEY], + processed_dict[common.PRED_SEMANTIC_KEY] + ) = self._post_processor( + result_dict[common.PRED_PIXEL_SPACE_MASK_LOGITS_KEY], + result_dict[common.PRED_TRANSFORMER_CLASS_LOGITS_KEY]) + return processed_dict diff --git a/model/post_processor/max_deeplab_test.py b/model/post_processor/max_deeplab_test.py new file mode 100644 index 0000000000000000000000000000000000000000..639e3c515c20ce9e498bd23c39965951b7514823 --- /dev/null +++ b/model/post_processor/max_deeplab_test.py @@ -0,0 +1,231 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Test for max_deeplab.py.""" +import numpy as np +import tensorflow as tf + +from deeplab2.model.post_processor import max_deeplab + + +class PostProcessingTest(tf.test.TestCase): + + def test_filter_by_count(self): + input_index_map = tf.convert_to_tensor( + [[[1, 1, 1, 1], + [1, 2, 2, 1], + [3, 3, 3, 3], + [4, 5, 5, 5]], + [[4, 5, 5, 5], + [3, 3, 3, 3], + [1, 2, 2, 1], + [1, 1, 1, 1]]], dtype=tf.float32) + area_limit = 3 + filtered_index_map, mask = max_deeplab._filter_by_count( + input_index_map, area_limit) + + expected_filtered_index_map = tf.convert_to_tensor( + [[[1, 1, 1, 1], + [1, 0, 0, 1], + [3, 3, 3, 3], + [0, 5, 5, 5]], + [[0, 5, 5, 5], + [3, 3, 3, 3], + [1, 0, 0, 1], + [1, 1, 1, 1]]], dtype=tf.float32) + np.testing.assert_equal(filtered_index_map.numpy(), + expected_filtered_index_map.numpy()) + expected_mask = tf.convert_to_tensor( + [[[1, 1, 1, 1], + [1, 0, 0, 1], + [1, 1, 1, 1], + [0, 1, 1, 1]], + [[0, 1, 1, 1], + [1, 1, 1, 1], + [1, 0, 0, 1], + [1, 1, 1, 1]]], dtype=tf.float32) + np.testing.assert_equal(mask.numpy(), expected_mask.numpy()) + + def test_get_mask_id_and_semantic_maps(self): + height = 21 + width = 21 + num_mask_slots = 5 + num_thing_stuff_classes = 19 + thing_class_ids = list(range(11, 19)) + stuff_class_ids = list(range(0, 11)) + pixel_space_mask_logits = tf.random.uniform( + (height, width, num_mask_slots), minval=-10, maxval=10) + # Class scores are normalized beforehand (softmax-ed beforehand). 
+    transformer_class_probs = tf.random.uniform(
+        (num_mask_slots, num_thing_stuff_classes + 1), minval=0, maxval=1)
+    input_shape = [41, 41]
+    pixel_confidence_threshold = 0.4
+    transformer_class_confidence_threshold = 0.7
+    pieces = 2
+
+    mask_id_map, semantic_map, thing_mask, stuff_mask = (
+        max_deeplab._get_mask_id_and_semantic_maps(
+            thing_class_ids, stuff_class_ids, pixel_space_mask_logits,
+            transformer_class_probs, input_shape, pixel_confidence_threshold,
+            transformer_class_confidence_threshold, pieces)
+        )
+    self.assertListEqual(mask_id_map.get_shape().as_list(), input_shape)
+    self.assertListEqual(semantic_map.get_shape().as_list(), input_shape)
+    self.assertListEqual(thing_mask.get_shape().as_list(), input_shape)
+    self.assertListEqual(stuff_mask.get_shape().as_list(), input_shape)
+
+  def test_merge_mask_id_and_semantic_maps(self):
+    mask_id_maps = tf.convert_to_tensor(
+        [[[1, 1, 1, 1],
+          [1, 2, 2, 1],
+          [3, 3, 4, 4],
+          [5, 5, 6, 6]]], dtype=tf.int32)
+    semantic_maps = tf.convert_to_tensor(
+        [[[0, 0, 0, 0],
+          [0, 1, 1, 0],
+          [2, 2, 2, 2],
+          [2, 2, 3, 3]]], dtype=tf.int32)
+    thing_masks = tf.convert_to_tensor(
+        [[[0, 0, 0, 0],
+          [0, 0, 0, 0],
+          [1, 1, 1, 1],
+          [1, 0, 1, 1]]], dtype=tf.float32)  # thing_class_ids = [2, 3]
+    stuff_masks = tf.convert_to_tensor(
+        [[[1, 1, 1, 0],
+          [1, 1, 1, 1],
+          [0, 0, 0, 0],
+          [0, 0, 0, 0]]], dtype=tf.float32)  # stuff_class_ids = [0, 1]
+
+    batch_size = 3
+    mask_id_maps = tf.repeat(mask_id_maps, repeats=batch_size, axis=0)
+    semantic_maps = tf.repeat(semantic_maps, repeats=batch_size, axis=0)
+    thing_masks = tf.repeat(thing_masks, repeats=batch_size, axis=0)
+    stuff_masks = tf.repeat(stuff_masks, repeats=batch_size, axis=0)
+
+    label_divisor = 100
+    stuff_area_limit = 3
+    void_label = 255
+    thing_area_limit = 2
+    # The expected_panoptic_prediction is computed as follows.
+    # All uncertain regions will be labeled as `void_label * label_divisor`.
+    # For `thing` segmentation, instances 3, 4, and 6 are kept, but instance 5
+    # is re-labeled as `void_label * label_divisor` since its area is reduced
+    # by `confident_regions` and then filtered out by thing_area_limit.
+    # For `stuff` segmentation, the class-0 region is kept, while the class-1
+    # region is re-labeled as `void_label * label_divisor` since its area is
+    # smaller than stuff_area_limit.
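+    # For instance (reading off the inputs above): thing instance 3 has
+    # semantic ID 2, so its surviving pixels are encoded as
+    # 2 * label_divisor + 3 = 203 in the expected prediction below.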
+    expected_panoptic_prediction = tf.convert_to_tensor(
+        [[[0, 0, 0, void_label * label_divisor],
+          [0, void_label * label_divisor, void_label * label_divisor, 0],
+          [2 * label_divisor + 3, 2 * label_divisor + 3, 2 * label_divisor + 4,
+           2 * label_divisor + 4],
+          [void_label * label_divisor, void_label * label_divisor,
+           3 * label_divisor + 6, 3 * label_divisor + 6]]],
+        dtype=tf.int32)
+    expected_panoptic_prediction = tf.repeat(
+        expected_panoptic_prediction, repeats=batch_size, axis=0)
+    panoptic_prediction = (
+        max_deeplab._merge_mask_id_and_semantic_maps(
+            mask_id_maps, semantic_maps, thing_masks, stuff_masks, void_label,
+            label_divisor, thing_area_limit, stuff_area_limit))
+
+    np.testing.assert_equal(expected_panoptic_prediction.numpy(),
+                            panoptic_prediction.numpy())
+
+  def test_get_panoptic_predictions(self):
+    batch = 1
+    height = 5
+    width = 5
+    num_thing_stuff_classes = 2
+    thing_class_ids = list(range(1, num_thing_stuff_classes + 1))  # [1, 2]
+    label_divisor = 10
+    stuff_area_limit = 3
+    void_label = 0  # `class-0` is `void`
+
+    o, x = 10, -10
+    pixel_space_mask_logits = tf.convert_to_tensor(
+        [[[[o, o, o, o, o],  # instance-1 mask
+           [o, x, x, o, o],
+           [x, x, x, x, x],
+           [x, x, x, x, x],
+           [x, x, x, x, x]],
+
+          [[x, x, x, x, x],  # instance-2 mask
+           [x, o, o, x, x],
+           [x, o, o, x, x],
+           [x, o, o, x, x],
+           [x, x, x, x, x]],
+
+          [[x, x, x, x, x],  # instance-3 mask
+           [x, x, x, x, x],
+           [o, x, x, o, o],
+           [o, x, x, o, o],
+           [o, o, o, o, o]]]],
+        dtype=tf.float32)
+    pixel_space_mask_logits = tf.transpose(pixel_space_mask_logits,
+                                           perm=[0, 2, 3, 1])  # b, h, w, c
+    # The softmax applied inside _get_panoptic_predictions turns these logits
+    # into near-one-hot class scores; the third column (class-2) represents
+    # the `void` class scores.
+    transformer_class_logits = tf.convert_to_tensor(
+        [[
+            [o, x, x],  # instance-1 -- class-0
+            [o, x, x],  # instance-2 -- class-0
+            [x, o, x],  # instance-3 -- class-1
+        ]], dtype=tf.float32)
+
+    input_shape = [5, 5]
+    pixel_confidence_threshold = 0.4
+    transformer_class_confidence_threshold = 0.7
+    thing_area_limit = 3
+    pieces = 1  # No piece-wise operation used.
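+    # Note on the class mapping (derived from the setup above): the argmax of
+    # the transformer class logits gives mask-slot classes [0, 0, 1], which
+    # index into thing_stuff_class_ids = [1, 2], yielding semantic IDs 1, 1, 2
+    # for the three instances in expected_semantic_maps below.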
+ + panoptic_maps, mask_id_maps, semantic_maps = ( + max_deeplab._get_panoptic_predictions( + pixel_space_mask_logits, transformer_class_logits, thing_class_ids, + void_label, label_divisor, thing_area_limit, stuff_area_limit, + input_shape, pixel_confidence_threshold, + transformer_class_confidence_threshold, pieces) + ) + self.assertSequenceEqual(panoptic_maps.shape, (batch, height, width)) + self.assertSequenceEqual(semantic_maps.shape, (batch, height, width)) + self.assertSequenceEqual(mask_id_maps.shape, (batch, height, width)) + expected_panoptic_maps = [[ # label_divisor = 10 + [11, 11, 11, 11, 11], # 11: semantic_id=1, instance_id=1 + [11, 12, 12, 11, 11], # 12: semantic_id=1, instance_id=2 + [23, 12, 12, 23, 23], # 23: semantic_id=2, instance_id=3 + [23, 12, 12, 23, 23], + [23, 23, 23, 23, 23], + ]] + np.testing.assert_array_equal(panoptic_maps, expected_panoptic_maps) + expected_mask_id_maps = [[ + [1, 1, 1, 1, 1], + [1, 2, 2, 1, 1], + [3, 2, 2, 3, 3], + [3, 2, 2, 3, 3], + [3, 3, 3, 3, 3], + ]] + np.testing.assert_array_equal(mask_id_maps, expected_mask_id_maps) + expected_semantic_maps = [[ + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [2, 1, 1, 2, 2], + [2, 1, 1, 2, 2], + [2, 2, 2, 2, 2], + ]] + np.testing.assert_array_equal(semantic_maps, expected_semantic_maps) + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/post_processor/motion_deeplab.py b/model/post_processor/motion_deeplab.py new file mode 100644 index 0000000000000000000000000000000000000000..afd637f6e4e2d1c57a9f3f7df9d1c98c617a8cc3 --- /dev/null +++ b/model/post_processor/motion_deeplab.py @@ -0,0 +1,257 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This file contains functions to post-process Motion-DeepLab results.""" + +from typing import Tuple + +import tensorflow as tf + + +def assign_instances_to_previous_tracks( + prev_centers: tf.Tensor, + current_centers: tf.Tensor, + heatmap: tf.Tensor, + offsets: tf.Tensor, + panoptic_map: tf.Tensor, + next_id: tf.Tensor, + label_divisor: int, + sigma=7) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: + """Greedy assignment of current centers to previous centers. + + Current centers are selected in decreasing order of confidence (heatmap + scores). These centers are transformed with the offsets and assigned to + previous centers. + + Args: + prev_centers: A tf.Tensor containing previous centers of shape [Np, 5]. This + tensor contains: + [0]: The x-coordinate. + [1]: The y-coordinate. + [2]: The panoptic ID. + [3]: The geometric mean of width and height of the instance mask. + [4]: The number of frames that no new masks got assigned to this center. + current_centers: A tf.Tensor containing centers of current frame of shape + [Nc, 5]. This tensor contains: + [0]: The x-coordinate. + [1]: The y-coordinate. + [2]: The panoptic ID. + [3]: The geometric mean of width and height of the instance mask. + [4]: The number of frames that no new masks got assigned to this center. 
+    heatmap: A tf.Tensor of shape [batch, height, width] containing the center
+      heatmap.
+    offsets: A tf.Tensor of shape [batch, height, width, 2] containing the
+      center offsets.
+    panoptic_map: A tf.Tensor of shape [batch, height, width] containing the
+      panoptic segmentation.
+    next_id: A tf.Tensor of shape [1] containing the next ID.
+    label_divisor: An integer specifying the label divisor for panoptic IDs.
+    sigma: An optional integer specifying the number of frames that unmatched
+      centers should be kept for (default: 7).
+
+  Returns:
+    A tuple of three tf.Tensor:
+      1. The updated panoptic segmentation map that contains track IDs.
+      2. The updated tensor containing all current centers (including unmatched
+         previous ones).
+      3. The updated next ID that can be used for new tracks.
+  """
+  # Switch x and y coordinates for indexing.
+  center_indices = tf.concat(
+      [tf.zeros([tf.shape(current_centers)[0], 1], dtype=tf.int32),
+       current_centers[:, 1:2], current_centers[:, 0:1]],
+      axis=1)
+  confidence_scores = tf.gather_nd(heatmap, center_indices)
+
+  scores = tf.argsort(confidence_scores, direction='DESCENDING')
+  cond = lambda i, *_: i < tf.shape(center_indices)[0]
+
+  def body(i, current_centers_loop, prev_centers_loop, new_panoptic_map_loop,
+           next_id_loop):
+    row_index = scores[i]
+    i = tf.add(i, 1)
+    center_id = current_centers_loop[row_index, 2]
+    center_location = current_centers_loop[row_index, :2]
+    center_offset_yx = offsets[0, center_location[1], center_location[0], :]
+    center_offset_xy = center_offset_yx[::-1]
+    center_location = center_offset_xy + tf.cast(center_location, tf.float32)
+    center_sem_id = center_id // label_divisor
+    center_mask = tf.equal(panoptic_map, center_id)
+    prev_centers_class = prev_centers_loop[:, 2] // label_divisor
+    prev_centers_with_same_class = tf.squeeze(
+        tf.cast(
+            tf.gather(
+                prev_centers_loop,
+                tf.where(tf.equal(prev_centers_class, center_sem_id)),
+                axis=0), tf.float32),
+        axis=1)
+
+    # Check if there are still unassigned previous centers of the same class.
+    if tf.shape(prev_centers_with_same_class)[0] > 0:
+      # For efficiency reasons, we do not take the sqrt when we compute the
+      # minimal distances. See render_panoptic_map_as_heatmap as well.
+      distances = tf.reduce_sum(
+          tf.square(prev_centers_with_same_class[:, :2] - center_location),
+          axis=1)
+      prev_center_index = tf.math.argmin(
+          distances, axis=0, output_type=tf.int32)
+      min_dist = distances[prev_center_index]
+
+      # If the previous center is within a certain range, continue the track.
+      if min_dist < prev_centers_with_same_class[prev_center_index, 3]:
+        new_center_id = tf.cast(
+            prev_centers_with_same_class[prev_center_index, 2], dtype=tf.int32)
+        shape = new_panoptic_map_loop.get_shape()
+        new_panoptic_map_loop = tf.where(center_mask, new_center_id,
+                                         new_panoptic_map_loop)
+        new_panoptic_map_loop.set_shape(shape)
+        current_centers_loop = tf.tensor_scatter_nd_update(
+            current_centers_loop, tf.expand_dims([row_index, 2], 0),
+            [new_center_id])
+        # Remove previous center.
+ prev_centers_loop = tf.squeeze( + tf.gather( + prev_centers_loop, + tf.where(tf.not_equal(prev_centers_loop[:, 2], new_center_id)), + axis=0), + axis=1) + return (i, current_centers_loop, prev_centers_loop, + new_panoptic_map_loop, next_id_loop) + else: + # Assign new track ID + new_center_id = center_sem_id * label_divisor + next_id_loop + shape = new_panoptic_map_loop.get_shape() + new_panoptic_map_loop = tf.where(center_mask, new_center_id, + new_panoptic_map_loop) + new_panoptic_map_loop.set_shape(shape) + current_centers_loop = tf.tensor_scatter_nd_update( + current_centers_loop, tf.expand_dims([row_index, 2], 0), + [new_center_id]) + next_id_loop += 1 + return (i, current_centers_loop, prev_centers_loop, + new_panoptic_map_loop, next_id_loop) + else: + # Assign new track ID + new_center_id = center_sem_id * label_divisor + next_id_loop + shape = new_panoptic_map_loop.get_shape() + new_panoptic_map_loop = tf.where(center_mask, new_center_id, + new_panoptic_map_loop) + new_panoptic_map_loop.set_shape(shape) + current_centers_loop = tf.tensor_scatter_nd_update( + current_centers_loop, tf.expand_dims([row_index, 2], 0), + [new_center_id]) + next_id_loop += 1 + return (i, current_centers_loop, prev_centers_loop, new_panoptic_map_loop, + next_id_loop) + + loop_start_index = tf.constant(0) + (_, current_centers, + unmatched_centers, new_panoptic_map, next_id) = tf.while_loop( + cond, body, + (loop_start_index, current_centers, prev_centers, panoptic_map, + next_id)) + + # Keep unmatched centers for sigma frames. + if tf.shape(unmatched_centers)[0] > 0: + current_centers = tf.concat([current_centers, unmatched_centers], axis=0) + + number_centers = tf.shape(current_centers)[0] + indices_row = tf.range(number_centers, dtype=tf.int32) + indices_column = tf.repeat([4], number_centers, axis=0) + indices = tf.stack([indices_row, indices_column], axis=1) + current_centers = tf.tensor_scatter_nd_add( + current_centers, indices, + tf.repeat([1], number_centers, axis=0)) + + # Remove centers after sigma frames. + current_centers = tf.squeeze( + tf.gather( + current_centers, + tf.where(tf.not_equal(current_centers[:, 4], sigma)), + axis=0), + axis=1) + + return new_panoptic_map, current_centers, next_id + + +def render_panoptic_map_as_heatmap( + panoptic_map: tf.Tensor, sigma: int, label_divisor: int, + void_label: int) -> Tuple[tf.Tensor, tf.Tensor]: + """Extracts centers from panoptic map and renders as heatmap.""" + gaussian_size = 6 * sigma + 3 + x = tf.range(gaussian_size, dtype=tf.float32) + y = tf.expand_dims(x, axis=1) + x0, y0 = 3 * sigma + 1, 3 * sigma + 1 + gaussian = tf.math.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2)) + gaussian = tf.cast(tf.reshape(gaussian, [-1]), tf.float32) + + height = tf.shape(panoptic_map)[1] + width = tf.shape(panoptic_map)[2] + # Pad center to make boundary handling easier. + center_pad_begin = int(round(3 * sigma + 1)) + center_pad_end = int(round(3 * sigma + 2)) + center_pad = center_pad_begin + center_pad_end + + center = tf.zeros((height + center_pad, width + center_pad)) + unique_ids, _ = tf.unique(tf.reshape(panoptic_map, [-1])) + centers_and_ids = tf.TensorArray( + tf.int32, size=0, dynamic_size=True, clear_after_read=False) + counter = tf.zeros([], dtype=tf.int32) + + for panoptic_id in unique_ids: + semantic_id = panoptic_id // label_divisor + # Filter out IDs that should be ignored, are stuff classes or crowd. 
+    # Stuff classes and crowd regions both have IDs of the form panoptic_id =
+    # semantic_id * label_divisor.
+    if semantic_id == void_label or panoptic_id % label_divisor == 0:
+      continue
+
+    # Convert [[0, y0, x0], ...] to [[0, ...], [y0, ...], [x0, ...]].
+    mask_index = tf.cast(
+        tf.transpose(tf.where(panoptic_map == panoptic_id)), tf.float32)
+    mask_size = (
+        tf.reduce_max(mask_index, axis=1) - tf.reduce_min(mask_index, axis=1))
+    # The radius is defined as the geometric mean of width and height.
+    # For efficiency reasons, we do not take the sqrt when we compute the
+    # minimal distances. See assign_instances_to_previous_tracks as well.
+    mask_radius = tf.cast(tf.round(mask_size[1] * mask_size[2]), tf.int32)
+    centers = tf.reduce_mean(mask_index, axis=1)
+
+    center_x = tf.cast(tf.round(centers[2]), tf.int32)
+    center_y = tf.cast(tf.round(centers[1]), tf.int32)
+    centers_and_ids = centers_and_ids.write(
+        counter,
+        [center_x, center_y, tf.cast(panoptic_id, tf.int32), mask_radius, 0])
+    counter += 1
+
+    # Due to the padding with center_pad_begin in center, the computed center
+    # becomes the upper left corner in the center tensor.
+    upper_left = center_x, center_y
+    bottom_right = (upper_left[0] + gaussian_size,
+                    upper_left[1] + gaussian_size)
+
+    indices_x, indices_y = tf.meshgrid(
+        tf.range(upper_left[0], bottom_right[0]),
+        tf.range(upper_left[1], bottom_right[1]))
+    indices = tf.transpose(
+        tf.stack([tf.reshape(indices_y, [-1]),
+                  tf.reshape(indices_x, [-1])]))
+
+    center = tf.tensor_scatter_nd_max(
+        center, indices, gaussian, name='center_scatter')
+
+  center = center[center_pad_begin:(center_pad_begin + height),
+                  center_pad_begin:(center_pad_begin + width)]
+  return tf.expand_dims(center, axis=0), centers_and_ids.stack()
diff --git a/model/post_processor/panoptic_deeplab.py b/model/post_processor/panoptic_deeplab.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1d6aa31331c9035bc741f33494bd9df95fe676b
--- /dev/null
+++ b/model/post_processor/panoptic_deeplab.py
@@ -0,0 +1,463 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This file contains functions to post-process Panoptic-DeepLab results."""
+
+import functools
+from typing import Tuple, Dict, Text
+
+import tensorflow as tf
+
+from deeplab2 import common
+from deeplab2 import config_pb2
+from deeplab2.data import dataset
+from deeplab2.model import utils
+from deeplab2.tensorflow_ops.python.ops import merge_semantic_and_instance_maps_op as merge_ops
+
+
+def _get_semantic_predictions(semantic_logits: tf.Tensor) -> tf.Tensor:
+  """Computes the semantic classes from the predictions.
+
+  Args:
+    semantic_logits: A tf.Tensor of shape [batch, height, width, classes].
+
+  Returns:
+    A tf.Tensor containing the semantic class prediction of shape
+    [batch, height, width].
+ """ + return tf.argmax(semantic_logits, axis=-1, output_type=tf.int32) + + +def _get_instance_centers_from_heatmap( + center_heatmap: tf.Tensor, center_threshold: float, nms_kernel_size: int, + keep_k_centers: int) -> Tuple[tf.Tensor, tf.Tensor]: + """Computes a list of instance centers. + + Args: + center_heatmap: A tf.Tensor of shape [height, width, 1]. + center_threshold: A float setting the threshold for the center heatmap. + nms_kernel_size: An integer specifying the nms kernel size. + keep_k_centers: An integer specifying the number of centers to keep (K). + Non-positive values will keep all centers. + + Returns: + A tuple of + - tf.Tensor of shape [N, 2] containing N center coordinates (after + non-maximum suppression) in (y, x) order. + - tf.Tensor of shape [height, width] containing the center heatmap after + non-maximum suppression. + """ + # Threshold center map. + center_heatmap = tf.where( + tf.greater(center_heatmap, center_threshold), center_heatmap, 0.0) + + # Non-maximum suppression. + padded_map = utils.add_zero_padding(center_heatmap, nms_kernel_size, rank=3) + pooled_center_heatmap = tf.keras.backend.pool2d( + tf.expand_dims(padded_map, 0), + pool_size=(nms_kernel_size, nms_kernel_size), + strides=(1, 1), + padding='valid', + pool_mode='max') + center_heatmap = tf.where( + tf.equal(pooled_center_heatmap, center_heatmap), center_heatmap, 0.0) + center_heatmap = tf.squeeze(center_heatmap, axis=[0, 3]) + + # `centers` is of shape (N, 2) with (y, x) order of the second dimension. + centers = tf.where(tf.greater(center_heatmap, 0.0)) + + if keep_k_centers > 0 and tf.shape(centers)[0] > keep_k_centers: + topk_scores, _ = tf.math.top_k( + tf.reshape(center_heatmap, [-1]), keep_k_centers, sorted=False) + centers = tf.where(tf.greater(center_heatmap, topk_scores[-1])) + + return centers, center_heatmap + + +def _find_closest_center_per_pixel(centers: tf.Tensor, + center_offsets: tf.Tensor) -> tf.Tensor: + """Assigns all pixels to their closest center. + + Args: + centers: A tf.Tensor of shape [N, 2] containing N centers with coordinate + order (y, x). + center_offsets: A tf.Tensor of shape [height, width, 2]. + + Returns: + A tf.Tensor of shape [height, width] containing the index of the closest + center, per pixel. + """ + height = tf.shape(center_offsets)[0] + width = tf.shape(center_offsets)[1] + + x_coord, y_coord = tf.meshgrid(tf.range(width), tf.range(height)) + coord = tf.stack([y_coord, x_coord], axis=-1) + + center_per_pixel = tf.cast(coord, tf.float32) + center_offsets + + # centers: [N, 2] -> [N, 1, 2]. + # center_per_pixel: [H, W, 2] -> [1, H*W, 2]. + centers = tf.cast(tf.expand_dims(centers, 1), tf.float32) + center_per_pixel = tf.reshape(center_per_pixel, [height*width, 2]) + center_per_pixel = tf.expand_dims(center_per_pixel, 0) + + # distances: [N, H*W]. + distances = tf.norm(centers - center_per_pixel, axis=-1) + + return tf.reshape(tf.argmin(distances, axis=0), [height, width]) + + +def _get_instances_from_heatmap_and_offset( + semantic_segmentation: tf.Tensor, center_heatmap: tf.Tensor, + center_offsets: tf.Tensor, center_threshold: float, + thing_class_ids: tf.Tensor, nms_kernel_size: int, + keep_k_centers: int) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: + """Computes the instance assignment per pixel. + + Args: + semantic_segmentation: A tf.Tensor containing the semantic labels of shape + [height, width]. + center_heatmap: A tf.Tensor of shape [height, width, 1]. + center_offsets: A tf.Tensor of shape [height, width, 2]. 
+ center_threshold: A float setting the threshold for the center heatmap. + thing_class_ids: A tf.Tensor of shape [N] containing N thing indices. + nms_kernel_size: An integer specifying the nms kernel size. + keep_k_centers: An integer specifying the number of centers to keep. + Negative values will keep all centers. + + Returns: + A tuple of: + - tf.Tensor containing the instance segmentation (filtered with the `thing` + segmentation from the semantic segmentation output) with shape + [height, width]. + - tf.Tensor containing the processed centermap with shape [height, width]. + - tf.Tensor containing instance scores (where higher "score" is a reasonable + signal of a higher confidence detection.) Will be of shape [height, width] + with the score for a pixel being the score of the instance it belongs to. + The scores will be zero for pixels in background/"stuff" regions. + """ + thing_segmentation = tf.zeros_like(semantic_segmentation) + for thing_id in thing_class_ids: + thing_segmentation = tf.where(tf.equal(semantic_segmentation, thing_id), + 1, + thing_segmentation) + + centers, processed_center_heatmap = _get_instance_centers_from_heatmap( + center_heatmap, center_threshold, nms_kernel_size, keep_k_centers) + if tf.shape(centers)[0] == 0: + return (tf.zeros_like(semantic_segmentation), processed_center_heatmap, + tf.zeros_like(processed_center_heatmap)) + + instance_center_index = _find_closest_center_per_pixel( + centers, center_offsets) + # Instance IDs should start with 1. So we use the index into the centers, but + # shifted by 1. + instance_segmentation = tf.cast(instance_center_index, tf.int32) + 1 + + # The value of the heatmap at an instance's center is used as the score + # for that instance. + instance_scores = tf.gather_nd(processed_center_heatmap, centers) + tf.debugging.assert_shapes([ + (centers, ('N', 2)), + (instance_scores, ('N',)), + ]) + # This will map the instance scores back to the image space: where each pixel + # has a value equal to the score of its instance. + flat_center_index = tf.reshape(instance_center_index, [-1]) + instance_score_map = tf.gather(instance_scores, flat_center_index) + instance_score_map = tf.reshape(instance_score_map, + tf.shape(instance_segmentation)) + instance_score_map *= tf.cast(thing_segmentation, tf.float32) + + return (thing_segmentation * instance_segmentation, processed_center_heatmap, + instance_score_map) + + +@tf.function +def _get_panoptic_predictions( + semantic_logits: tf.Tensor, center_heatmap: tf.Tensor, + center_offsets: tf.Tensor, center_threshold: float, + thing_class_ids: tf.Tensor, label_divisor: int, stuff_area_limit: int, + void_label: int, nms_kernel_size: int, keep_k_centers: int, + merge_semantic_and_instance_with_tf_op: bool +) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]: + """Computes the semantic class and instance ID per pixel. + + Args: + semantic_logits: A tf.Tensor of shape [batch, height, width, classes]. + center_heatmap: A tf.Tensor of shape [batch, height, width, 1]. + center_offsets: A tf.Tensor of shape [batch, height, width, 2]. + center_threshold: A float setting the threshold for the center heatmap. + thing_class_ids: A tf.Tensor of shape [N] containing N thing indices. + label_divisor: An integer specifying the label divisor of the dataset. + stuff_area_limit: An integer specifying the number of pixels that stuff + regions need to have at least. 
The stuff region will be included in the
+      panoptic prediction, only if its area is larger than the limit; otherwise,
+      it will be re-assigned as void_label.
+    void_label: An integer specifying the void label.
+    nms_kernel_size: An integer specifying the nms kernel size.
+    keep_k_centers: An integer specifying the number of centers to keep.
+      Negative values will keep all centers.
+    merge_semantic_and_instance_with_tf_op: Boolean, specifying whether the
+      merging operation uses the TensorFlow (CUDA kernel) implementation
+      (True) or the tf.py_function implementation (False). Note the
+      tf.py_function implementation is simply used as a backup solution when
+      you could not successfully compile the provided TensorFlow
+      implementation. To reproduce our results, please use the provided
+      TensorFlow implementation `merge_ops` (i.e., set to True).
+
+  Returns:
+    A tuple of:
+    - the panoptic prediction as tf.Tensor with shape [batch, height, width].
+    - the semantic prediction as tf.Tensor with shape [batch, height, width].
+    - the instance prediction as tf.Tensor with shape [batch, height, width].
+    - the centermap prediction as tf.Tensor with shape [batch, height, width].
+    - the instance score maps as tf.Tensor with shape [batch, height, width].
+  """
+  semantic_prediction = _get_semantic_predictions(semantic_logits)
+  batch_size = tf.shape(semantic_logits)[0]
+
+  instance_map_lists = tf.TensorArray(
+      tf.int32, size=batch_size, dynamic_size=False)
+  center_map_lists = tf.TensorArray(
+      tf.float32, size=batch_size, dynamic_size=False)
+  instance_score_map_lists = tf.TensorArray(
+      tf.float32, size=batch_size, dynamic_size=False)
+
+  for i in tf.range(batch_size):
+    (instance_map, center_map,
+     instance_score_map) = _get_instances_from_heatmap_and_offset(
+         semantic_prediction[i, ...], center_heatmap[i, ...],
+         center_offsets[i, ...], center_threshold, thing_class_ids,
+         nms_kernel_size, keep_k_centers)
+    instance_map_lists = instance_map_lists.write(i, instance_map)
+    center_map_lists = center_map_lists.write(i, center_map)
+    instance_score_map_lists = instance_score_map_lists.write(
+        i, instance_score_map)
+
+  # This does not work with unknown shapes.
+  instance_maps = instance_map_lists.stack()
+  center_maps = center_map_lists.stack()
+  instance_score_maps = instance_score_map_lists.stack()
+
+  if merge_semantic_and_instance_with_tf_op:
+    panoptic_prediction = merge_ops.merge_semantic_and_instance_maps(
+        semantic_prediction, instance_maps, thing_class_ids, label_divisor,
+        stuff_area_limit, void_label)
+  else:
+    panoptic_prediction = _merge_semantic_and_instance_maps(
+        semantic_prediction, instance_maps, thing_class_ids, label_divisor,
+        stuff_area_limit, void_label)
+  return (panoptic_prediction, semantic_prediction, instance_maps, center_maps,
+          instance_score_maps)
+
+
+@tf.function
+def _merge_semantic_and_instance_maps(
+    semantic_prediction: tf.Tensor,
+    instance_maps: tf.Tensor,
+    thing_class_ids: tf.Tensor,
+    label_divisor: int,
+    stuff_area_limit: int,
+    void_label: int) -> tf.Tensor:
+  """Merges semantic and instance maps to obtain panoptic segmentation.
+
+  This function merges the semantic segmentation and class-agnostic
+  instance segmentation to form the panoptic segmentation. In particular,
+  the class label of each instance mask is inferred from the majority
+  votes from the corresponding pixels in the semantic segmentation. This
+  operation was first proposed in the DeeperLab paper and adopted by
+  Panoptic-DeepLab.
+
+  - DeeperLab: Single-Shot Image Parser, T-J Yang, et al.
arXiv:1902.05093. + - Panoptic-DeepLab, B. Cheng, et al. In CVPR, 2020. + + Note that this function only supports batch = 1 for simplicity. Additionally, + this function has a slightly different implementation from the provided + TensorFlow implementation `merge_ops` but with a similar performance. This + function is mainly used as a backup solution when you could not successfully + compile the provided TensorFlow implementation. To reproduce our results, + please use the provided TensorFlow implementation (i.e., not use this + function, but the `merge_ops.merge_semantic_and_instance_maps`). + + Args: + semantic_prediction: A tf.Tensor of shape [batch, height, width]. + instance_maps: A tf.Tensor of shape [batch, height, width]. + thing_class_ids: A tf.Tensor of shape [N] containing N thing indices. + label_divisor: An integer specifying the label divisor of the dataset. + stuff_area_limit: An integer specifying the number of pixels that stuff + regions need to have at least. The stuff region will be included in the + panoptic prediction, only if its area is larger than the limit; otherwise, + it will be re-assigned as void_label. + void_label: An integer specifying the void label. + + Returns: + panoptic_prediction: A tf.Tensor with shape [batch, height, width]. + """ + prediction_shape = semantic_prediction.get_shape().as_list() + # This implementation only supports batch size of 1. Since model construction + # might lose batch size information (and leave it to None), override it here. + prediction_shape[0] = 1 + semantic_prediction = tf.ensure_shape(semantic_prediction, prediction_shape) + instance_maps = tf.ensure_shape(instance_maps, prediction_shape) + + # Default panoptic_prediction to have semantic label = void_label. + panoptic_prediction = tf.ones_like( + semantic_prediction) * void_label * label_divisor + + # Start to paste predicted `thing` regions to panoptic_prediction. + # Infer `thing` segmentation regions from semantic prediction. + semantic_thing_segmentation = tf.zeros_like(semantic_prediction, + dtype=tf.bool) + for thing_class in thing_class_ids: + semantic_thing_segmentation = tf.math.logical_or( + semantic_thing_segmentation, + semantic_prediction == thing_class) + # Keep track of how many instances for each semantic label. + num_instance_per_semantic_label = tf.TensorArray( + tf.int32, size=0, dynamic_size=True, clear_after_read=False) + instance_ids, _ = tf.unique(tf.reshape(instance_maps, [-1])) + for instance_id in instance_ids: + # Instance ID 0 is reserved for crowd region. + if instance_id == 0: + continue + thing_mask = tf.math.logical_and(instance_maps == instance_id, + semantic_thing_segmentation) + if tf.reduce_sum(tf.cast(thing_mask, tf.int32)) == 0: + continue + semantic_bin_counts = tf.math.bincount( + tf.boolean_mask(semantic_prediction, thing_mask)) + semantic_majority = tf.cast( + tf.math.argmax(semantic_bin_counts), tf.int32) + + while num_instance_per_semantic_label.size() <= semantic_majority: + num_instance_per_semantic_label = num_instance_per_semantic_label.write( + num_instance_per_semantic_label.size(), 0) + + new_instance_id = ( + num_instance_per_semantic_label.read(semantic_majority) + 1) + num_instance_per_semantic_label = num_instance_per_semantic_label.write( + semantic_majority, new_instance_id) + panoptic_prediction = tf.where( + thing_mask, + tf.ones_like(panoptic_prediction) * semantic_majority * label_divisor + + new_instance_id, + panoptic_prediction) + + # Done with `num_instance_per_semantic_label` tensor array. 
+ num_instance_per_semantic_label.close() + + # Start to paste predicted `stuff` regions to panoptic prediction. + instance_stuff_regions = instance_maps == 0 + semantic_ids, _ = tf.unique(tf.reshape(semantic_prediction, [-1])) + for semantic_id in semantic_ids: + if tf.reduce_sum(tf.cast(thing_class_ids == semantic_id, tf.int32)) > 0: + continue + # Check stuff area. + stuff_mask = tf.math.logical_and(semantic_prediction == semantic_id, + instance_stuff_regions) + stuff_area = tf.reduce_sum(tf.cast(stuff_mask, tf.int32)) + if stuff_area >= stuff_area_limit: + panoptic_prediction = tf.where( + stuff_mask, + tf.ones_like(panoptic_prediction) * semantic_id * label_divisor, + panoptic_prediction) + + return panoptic_prediction + + +class SemanticOnlyPostProcessor(tf.keras.layers.Layer): + """This class contains code of a semantic only post-processor.""" + + def __init__(self): + """Initializes a semantic only post-processor.""" + super(SemanticOnlyPostProcessor, self).__init__( + name='SemanticOnlyPostProcessor') + + def call(self, result_dict: Dict[Text, tf.Tensor]) -> Dict[Text, tf.Tensor]: + """Performs the post-processing given model predicted results. + + Args: + result_dict: A dictionary of tf.Tensor containing model results. The dict + has to contain + - common.PRED_SEMANTIC_PROBS_KEY, + + Returns: + The post-processed dict of tf.Tensor, containing the following: + - common.PRED_SEMANTIC_KEY, + """ + processed_dict = {} + processed_dict[common.PRED_SEMANTIC_KEY] = _get_semantic_predictions( + result_dict[common.PRED_SEMANTIC_PROBS_KEY]) + return processed_dict + + +class PostProcessor(tf.keras.layers.Layer): + """This class contains code of a Panoptic-Deeplab post-processor.""" + + def __init__( + self, + config: config_pb2.ExperimentOptions, + dataset_descriptor: dataset.DatasetDescriptor): + """Initializes a Panoptic-Deeplab post-processor. + + Args: + config: A config_pb2.ExperimentOptions configuration. + dataset_descriptor: A dataset.DatasetDescriptor. + """ + super(PostProcessor, self).__init__(name='PostProcessor') + self._post_processor = functools.partial( + _get_panoptic_predictions, + center_threshold=config.evaluator_options.center_score_threshold, + thing_class_ids=tf.convert_to_tensor( + dataset_descriptor.class_has_instances_list), + label_divisor=dataset_descriptor.panoptic_label_divisor, + stuff_area_limit=config.evaluator_options.stuff_area_limit, + void_label=dataset_descriptor.ignore_label, + nms_kernel_size=config.evaluator_options.nms_kernel, + keep_k_centers=config.evaluator_options.keep_k_centers, + merge_semantic_and_instance_with_tf_op=( + config.evaluator_options.merge_semantic_and_instance_with_tf_op), + ) + + def call(self, result_dict: Dict[Text, tf.Tensor]) -> Dict[Text, tf.Tensor]: + """Performs the post-processing given model predicted results. + + Args: + result_dict: A dictionary of tf.Tensor containing model results. 
The dict + has to contain + - common.PRED_SEMANTIC_PROBS_KEY, + - common.PRED_CENTER_HEATMAP_KEY, + - common.PRED_OFFSET_MAP_KEY, + + Returns: + The post-processed dict of tf.Tensor, containing the following: + - common.PRED_SEMANTIC_KEY, + - common.PRED_INSTANCE_KEY, + - common.PRED_PANOPTIC_KEY, + - common.PRED_INSTANCE_CENTER_KEY, + - common.PRED_INSTANCE_SCORES_KEY, + """ + processed_dict = {} + (processed_dict[common.PRED_PANOPTIC_KEY], + processed_dict[common.PRED_SEMANTIC_KEY], + processed_dict[common.PRED_INSTANCE_KEY], + processed_dict[common.PRED_INSTANCE_CENTER_KEY], + processed_dict[common.PRED_INSTANCE_SCORES_KEY] + ) = self._post_processor( + result_dict[common.PRED_SEMANTIC_PROBS_KEY], + result_dict[common.PRED_CENTER_HEATMAP_KEY], + result_dict[common.PRED_OFFSET_MAP_KEY]) + return processed_dict diff --git a/model/post_processor/panoptic_deeplab_test.py b/model/post_processor/panoptic_deeplab_test.py new file mode 100644 index 0000000000000000000000000000000000000000..01b23656a1854718af2a73225e8a441efcafe0eb --- /dev/null +++ b/model/post_processor/panoptic_deeplab_test.py @@ -0,0 +1,141 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Test for panoptic_deeplab.py.""" +import numpy as np +import tensorflow as tf + +from deeplab2.model.post_processor import panoptic_deeplab + + +class PostProcessingTest(tf.test.TestCase): + + def test_py_func_merge_semantic_and_instance_maps_can_run(self): + batch = 1 + height = 5 + width = 5 + semantic_prediction = tf.random.uniform((batch, height, width), + minval=0, + maxval=20, + dtype=tf.int32) + instance_maps = tf.random.uniform((batch, height, width), + minval=0, + maxval=3, + dtype=tf.int32) + thing_class_ids = tf.convert_to_tensor([1, 2, 3]) + label_divisor = 256 + stuff_area_limit = 3 + void_label = 255 + panoptic_prediction = panoptic_deeplab._merge_semantic_and_instance_maps( + semantic_prediction, instance_maps, thing_class_ids, label_divisor, + stuff_area_limit, void_label) + self.assertListEqual(semantic_prediction.get_shape().as_list(), + panoptic_prediction.get_shape().as_list()) + + def test_merge_semantic_and_instance_maps_with_a_simple_example(self): + semantic_prediction = tf.convert_to_tensor( + [[[0, 0, 0, 0], + [0, 1, 1, 0], + [0, 2, 2, 0], + [2, 2, 3, 3]]], dtype=tf.int32) + instance_maps = tf.convert_to_tensor( + [[[0, 0, 0, 0], + [0, 0, 0, 0], + [0, 1, 1, 0], + [2, 2, 3, 3]]], dtype=tf.int32) + thing_class_ids = tf.convert_to_tensor([2, 3]) + label_divisor = 256 + stuff_area_limit = 3 + void_label = 255 + # The expected_panoptic_prediction is computed as follows. + # For `thing` segmentation, instance 1, 2, and 3 are kept, but instance 3 + # will have a new instance ID 1, since it is the first instance in its + # own semantic label. + # For `stuff` segmentation, class-0 region is kept, while class-1 region + # is re-labeled as `void_label * label_divisor` since its area is smaller + # than stuff_area_limit. 
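+    # For instance (reading off the inputs above): the class-2 instance that
+    # keeps instance ID 1 covers pixels (2, 1) and (2, 2), so both are encoded
+    # as 2 * label_divisor + 1 = 513 in the expected prediction below.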
+ expected_panoptic_prediction = tf.convert_to_tensor( + [[[0, 0, 0, 0], + [0, void_label * label_divisor, void_label * label_divisor, 0], + [0, 2 * label_divisor + 1, 2 * label_divisor + 1, 0], + [2 * label_divisor + 2, 2 * label_divisor + 2, 3 * label_divisor + 1, + 3 * label_divisor + 1]]], dtype=tf.int32) + panoptic_prediction = panoptic_deeplab._merge_semantic_and_instance_maps( + semantic_prediction, instance_maps, thing_class_ids, label_divisor, + stuff_area_limit, void_label) + np.testing.assert_equal(expected_panoptic_prediction.numpy(), + panoptic_prediction.numpy()) + + def test_gets_panoptic_predictions_with_score(self): + batch = 1 + height = 5 + width = 5 + classes = 3 + + semantic_logits = tf.random.uniform((batch, 1, 1, classes)) + semantic_logits = tf.tile(semantic_logits, (1, height, width, 1)) + + center_heatmap = tf.convert_to_tensor([ + [1.0, 0.0, 0.0, 0.0, 0.0], + [0.8, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.1, 0.7], + [0.0, 0.0, 0.0, 0.0, 0.2], + ], + dtype=tf.float32) + center_heatmap = tf.expand_dims(center_heatmap, 0) + center_heatmap = tf.expand_dims(center_heatmap, 3) + + center_offsets = tf.zeros((batch, height, width, 2)) + center_threshold = 0.0 + thing_class_ids = tf.range(classes) # No "stuff" classes. + label_divisor = 256 + stuff_area_limit = 16 + void_label = classes + nms_kernel_size = 3 + keep_k_centers = 2 + merge_semantic_and_instance_with_tf_op = True + + result = panoptic_deeplab._get_panoptic_predictions( + semantic_logits, center_heatmap, center_offsets, center_threshold, + thing_class_ids, label_divisor, stuff_area_limit, void_label, + nms_kernel_size, keep_k_centers, merge_semantic_and_instance_with_tf_op) + instance_maps = result[2].numpy() + instance_scores = result[4].numpy() + + self.assertSequenceEqual(instance_maps.shape, (batch, height, width)) + expected_instances = [[ + [1, 1, 1, 1, 2], + [1, 1, 1, 2, 2], + [1, 1, 2, 2, 2], + [1, 2, 2, 2, 2], + [1, 2, 2, 2, 2], + ]] + np.testing.assert_array_equal(instance_maps, expected_instances) + + self.assertSequenceEqual(instance_scores.shape, (batch, height, width)) + expected_instance_scores = [[ + [1.0, 1.0, 1.0, 1.0, 0.7], + [1.0, 1.0, 1.0, 0.7, 0.7], + [1.0, 1.0, 0.7, 0.7, 0.7], + [1.0, 0.7, 0.7, 0.7, 0.7], + [1.0, 0.7, 0.7, 0.7, 0.7], + ]] + np.testing.assert_array_almost_equal(instance_scores, + expected_instance_scores) + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/post_processor/post_processor_builder.py b/model/post_processor/post_processor_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..1ca93928236718d510eb65457cfe3da09c72efb5 --- /dev/null +++ b/model/post_processor/post_processor_builder.py @@ -0,0 +1,45 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""This file contains a post-processor builder used in the DeepLab model.""" + +import tensorflow as tf + +from deeplab2 import common +from deeplab2 import config_pb2 +from deeplab2.data import dataset +from deeplab2.model import utils +from deeplab2.model.post_processor import max_deeplab +from deeplab2.model.post_processor import panoptic_deeplab + + +def get_post_processor( + config: config_pb2.ExperimentOptions, + dataset_descriptor: dataset.DatasetDescriptor) -> tf.keras.layers.Layer: + """Initializes a DeepLab post-processor. + + Args: + config: A config_pb2.ExperimentOptions configuration. + dataset_descriptor: A dataset.DatasetDescriptor. + + Returns: + PostProcessor: A post-processor depending on the configuration. + """ + supported_tasks = utils.get_supported_tasks(config) + if config.model_options.WhichOneof('meta_architecture') == 'max_deeplab': + return max_deeplab.PostProcessor(config, dataset_descriptor) + if common.TASK_PANOPTIC_SEGMENTATION in supported_tasks: + return panoptic_deeplab.PostProcessor(config, dataset_descriptor) + return panoptic_deeplab.SemanticOnlyPostProcessor() diff --git a/model/post_processor/post_processor_builder_test.py b/model/post_processor/post_processor_builder_test.py new file mode 100644 index 0000000000000000000000000000000000000000..7c6ad49c819b7651fd3671332f84ce9bbc8f843e --- /dev/null +++ b/model/post_processor/post_processor_builder_test.py @@ -0,0 +1,77 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for post_processor_builder.py.""" + +import tensorflow as tf + +from google.protobuf import text_format +from deeplab2 import common +from deeplab2 import config_pb2 +from deeplab2.data import dataset +from deeplab2.model.post_processor import post_processor_builder + + +class EvaluatorTest(tf.test.TestCase): + + def test_evaluates_panoptic_deeplab_model(self): + experiment_options_textproto = """ + experiment_name: "evaluation_test" + eval_dataset_options { + dataset: "cityscapes_panoptic" + file_pattern: "EMPTY" + batch_size: 1 + crop_size: 1025 + crop_size: 2049 + # Skip resizing. 
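+        # With min/max_resize_value of 0, inputs keep the 1025x2049 crop
+        # size used by the tensors in this test.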
+        min_resize_value: 0
+        max_resize_value: 0
+      }
+      evaluator_options {
+        continuous_eval_timeout: 43200
+        stuff_area_limit: 2048
+        center_score_threshold: 0.1
+        nms_kernel: 13
+        save_predictions: true
+        save_raw_predictions: false
+      }
+    """
+    config = text_format.Parse(experiment_options_textproto,
+                               config_pb2.ExperimentOptions())
+    config.model_options.panoptic_deeplab.instance.enable = True
+    post_processor = post_processor_builder.get_post_processor(
+        config, dataset.CITYSCAPES_PANOPTIC_INFORMATION)
+
+    result_dict = {
+        common.PRED_SEMANTIC_PROBS_KEY:
+            tf.zeros([1, 1025, 2049, 19], dtype=tf.float32),
+        common.PRED_CENTER_HEATMAP_KEY:
+            tf.zeros([1, 1025, 2049, 1], dtype=tf.float32),
+        common.PRED_OFFSET_MAP_KEY:
+            tf.zeros([1, 1025, 2049, 2], dtype=tf.float32)
+    }
+    processed_dict = post_processor(result_dict)
+    expected_keys = {
+        common.PRED_PANOPTIC_KEY,
+        common.PRED_SEMANTIC_KEY,
+        common.PRED_INSTANCE_KEY,
+        common.PRED_INSTANCE_CENTER_KEY,
+        common.PRED_INSTANCE_SCORES_KEY
+    }
+    self.assertCountEqual(processed_dict.keys(), expected_keys)
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/model/post_processor/vip_deeplab.py b/model/post_processor/vip_deeplab.py
new file mode 100644
index 0000000000000000000000000000000000000000..552841110d94b053776e0539353f835e8ae095a8
--- /dev/null
+++ b/model/post_processor/vip_deeplab.py
@@ -0,0 +1,106 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This file contains functions to post-process ViP-DeepLab results."""
+
+import numpy as np
+
+
+def stitch_video_panoptic_prediction(
+    concat_panoptic: np.ndarray,
+    next_panoptic: np.ndarray,
+    label_divisor: int,
+    overlap_offset: int = 128,
+    combine_offset: int = 2 ** 32) -> np.ndarray:
+  """The stitching algorithm in ViP-DeepLab.
+
+  This function stitches a pair of image panoptic predictions to form video
+  panoptic predictions by propagating instance IDs from concat_panoptic to
+  next_panoptic based on IoU matching.
+
+  Siyuan Qiao, Yukun Zhu, Hartwig Adam, Alan Yuille, and Liang-Chieh Chen.
+  "ViP-DeepLab: Learning Visual Perception with Depth-aware Video Panoptic
+  Segmentation." CVPR, 2021.
+
+  Args:
+    concat_panoptic: Panoptic prediction of the next frame by concatenating
+      it with the current frame.
+    next_panoptic: Panoptic prediction of the next frame.
+    label_divisor: An integer specifying the label divisor of the dataset.
+    overlap_offset: An integer offset to avoid overlap between the IDs in
+      next_panoptic and the propagated IDs from concat_panoptic.
+    combine_offset: An integer offset to combine concat and next panoptic.
+
+  Returns:
+    Panoptic prediction of the next frame with the instance IDs propagated
+    from the concatenated panoptic prediction.
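+
+  Example (an illustrative call, mirroring the unit test in
+  vip_deeplab_test.py):
+
+    new_panoptic = stitch_video_panoptic_prediction(
+        concat_panoptic, next_panoptic, label_divisor=1000)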
+ """ + def _ids_to_counts(id_array: np.ndarray): + """Given a numpy array, a mapping from each entry to its count.""" + ids, counts = np.unique(id_array, return_counts=True) + return dict(zip(ids, counts)) + new_panoptic = next_panoptic.copy() + # Increase the panoptic instance ID to avoid overlap. + new_category = new_panoptic // label_divisor + new_instance = new_panoptic % label_divisor + # We skip 0 which is reserved for crowd. + instance_mask = new_instance > 0 + new_instance[instance_mask] = new_instance[instance_mask] + overlap_offset + new_panoptic = new_category * label_divisor + new_instance + # Pre-compute areas for all the segments. + concat_segment_areas = _ids_to_counts(concat_panoptic) + next_segment_areas = _ids_to_counts(next_panoptic) + # Combine concat_panoptic and next_panoptic. + intersection_id_array = (concat_panoptic.astype(np.int64) * + combine_offset + next_panoptic.astype(np.int64)) + intersection_areas = _ids_to_counts(intersection_id_array) + # Compute IoU and sort them. + intersection_ious = [] + for intersection_id, intersection_area in intersection_areas.items(): + concat_panoptic_label = int(intersection_id // combine_offset) + next_panoptic_label = int(intersection_id % combine_offset) + concat_category_label = concat_panoptic_label // label_divisor + next_category_label = next_panoptic_label // label_divisor + if concat_category_label != next_category_label: + continue + concat_instance_label = concat_panoptic_label % label_divisor + next_instance_label = next_panoptic_label % label_divisor + # We skip 0 which is reserved for crowd. + if concat_instance_label == 0 or next_instance_label == 0: + continue + union = ( + concat_segment_areas[concat_panoptic_label] + + next_segment_areas[next_panoptic_label] - + intersection_area) + iou = intersection_area / union + intersection_ious.append([ + concat_panoptic_label, next_panoptic_label, iou]) + intersection_ious = sorted( + intersection_ious, key=lambda e: e[2]) + # Build mapping and inverse mapping. Two-way mapping guarantees 1-to-1 + # matching. + map_concat_to_next = {} + map_next_to_concat = {} + for (concat_panoptic_label, next_panoptic_label, + iou) in intersection_ious: + map_concat_to_next[concat_panoptic_label] = next_panoptic_label + map_next_to_concat[next_panoptic_label] = concat_panoptic_label + # Match and propagate. + for (concat_panoptic_label, + next_panoptic_label) in map_concat_to_next.items(): + if map_next_to_concat[next_panoptic_label] == concat_panoptic_label: + propagate_mask = next_panoptic == next_panoptic_label + new_panoptic[propagate_mask] = concat_panoptic_label + return new_panoptic diff --git a/model/post_processor/vip_deeplab_test.py b/model/post_processor/vip_deeplab_test.py new file mode 100644 index 0000000000000000000000000000000000000000..e742fe470f5d2410b5c69005130977e9ee50e8a0 --- /dev/null +++ b/model/post_processor/vip_deeplab_test.py @@ -0,0 +1,67 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Test for vip_deeplab.py.""" +import numpy as np +import tensorflow as tf + +from deeplab2.model.post_processor import vip_deeplab + + +class PostProcessingTest(tf.test.TestCase): + + def test_stitch_video_panoptic_prediction(self): + concat_semantic = np.array( + [[[0, 0, 0, 0], + [0, 1, 1, 0], + [0, 2, 2, 0], + [2, 2, 3, 3]]], dtype=np.int32) + concat_instance = np.array( + [[[1, 1, 2, 2], + [1, 0, 0, 2], + [1, 1, 1, 2], + [2, 2, 1, 1]]], dtype=np.int32) + next_semantic = np.array( + [[[0, 1, 1, 0], + [0, 1, 1, 0], + [0, 2, 2, 0], + [2, 2, 3, 3]]], dtype=np.int32) + next_instance = np.array( + [[[2, 0, 0, 1], + [2, 0, 0, 1], + [2, 4, 4, 1], + [5, 5, 3, 3]]], dtype=np.int32) + label_divisor = 1000 + concat_panoptic = concat_semantic * label_divisor + concat_instance + next_panoptic = next_semantic * label_divisor + next_instance + new_panoptic = vip_deeplab.stitch_video_panoptic_prediction( + concat_panoptic, + next_panoptic, + label_divisor) + # The expected instance is manually computed. It should receive the IDs + # propagated from concat_instance by IoU matching between concat_panoptic + # and next_panoptic. + expected_semantic = next_semantic + expected_instance = np.array( + [[[1, 0, 0, 2], + [1, 0, 0, 2], + [1, 1, 1, 2], + [2, 2, 1, 1]]], dtype=np.int32) + expected_panoptic = expected_semantic * label_divisor + expected_instance + np.testing.assert_array_equal(expected_panoptic, new_panoptic) + + +if __name__ == '__main__': + tf.test.main() diff --git a/model/test_utils.py b/model/test_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2c0933405a86927eeab8ffa5ed076b88a88738f7 --- /dev/null +++ b/model/test_utils.py @@ -0,0 +1,31 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This file contains utility functions for the model tests.""" +import numpy as np +import tensorflow as tf + + +def create_test_input(batch, height, width, channels): + """Creates test input tensor.""" + input_tensor = np.tile( + np.reshape( + np.reshape(np.arange(height), [height, 1]) + + np.reshape(np.arange(width), [1, width]), + [1, height, width, 1]), + [batch, 1, 1, channels]) + # Normalize the input tensor so that the outputs are not too large. + input_tensor = (input_tensor * 2 / np.max(input_tensor)) - 1 + return tf.cast(input_tensor, tf.float32) diff --git a/model/test_utils_test.py b/model/test_utils_test.py new file mode 100644 index 0000000000000000000000000000000000000000..b0b676228beedca7ccd01fbe9bf3f7806497b2f3 --- /dev/null +++ b/model/test_utils_test.py @@ -0,0 +1,32 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for test_utils."""
+
+import tensorflow as tf
+
+from deeplab2.model import test_utils
+
+
+class TestUtilsTest(tf.test.TestCase):
+
+  def test_create_test_input(self):
+    input_shape = [1, 2, 3, 4]
+    input_tensor = test_utils.create_test_input(*input_shape)
+    self.assertListEqual(input_tensor.get_shape().as_list(), input_shape)
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/model/utils.py b/model/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b28a19ea3b18c8eff5039a2c6eb2270e197c8a20
--- /dev/null
+++ b/model/utils.py
@@ -0,0 +1,485 @@
+# coding=utf-8
+# Copyright 2021 The Deeplab2 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This file contains utility functions for the model code."""
+
+from typing import Any, List, MutableMapping, MutableSequence, Optional, Set
+
+import tensorflow as tf
+
+from deeplab2 import common
+from deeplab2 import config_pb2
+
+layers = tf.keras.layers
+
+_PREDICTION_WITH_NEAREST_UPSAMPLING = (
+    common.PRED_INSTANCE_KEY,
+    common.PRED_INSTANCE_CENTER_KEY,
+    common.PRED_INSTANCE_SCORES_KEY,
+    common.PRED_PANOPTIC_KEY,
+    common.PRED_SEMANTIC_KEY,
+    common.PRED_NEXT_PANOPTIC_KEY,
+    common.PRED_CONCAT_NEXT_PANOPTIC_KEY,
+    common.PRED_CENTER_HEATMAP_KEY,
+)
+
+_PREDICTION_WITH_BILINEAR_UPSAMPLING = (
+    common.PRED_SEMANTIC_PROBS_KEY,
+    common.PRED_OFFSET_MAP_KEY,
+)
+
+_INPUT_WITH_NEAREST_UPSAMPLING = (
+    common.GT_INSTANCE_CENTER_KEY,
+)
+
+_INPUT_WITH_BILINEAR_UPSAMPLING = (
+    common.IMAGE,
+    common.GT_INSTANCE_REGRESSION_KEY
+)
+
+
+def _scale_helper(value, scale):
+  if isinstance(value, tf.Tensor):
+    return tf.cast(
+        (tf.cast(value, dtype=tf.float32) - 1.0) * scale + 1.0,
+        dtype=tf.int32)
+  else:
+    return int((float(value) - 1.0) * scale + 1.0)
+
+
+def scale_mutable_sequence(input_sequence: MutableSequence[int],
+                           scale: float) -> MutableSequence[int]:
+  return [_scale_helper(x, scale) for x in input_sequence]
+
+
+def scale_int_list(int_list, scale):
+  return [int(x * scale) for x in int_list]
+
+
+def undo_image_preprocessing(image_in: tf.Tensor, method: str,
+                             perform_crop: bool,
+                             regions_to_crop: List[int],
+                             output_shape: List[int]) -> tf.Tensor:
+  """Undoes the image preprocessing.
+
+  In particular, this function slices out the valid regions (determined by
+  `regions_to_crop`) in the input when perform_crop is True. After
+  that, we resize the results to the desired `output_shape`.
+
+  Args:
+    image_in: Input image Tensor with shape [batch, height, width, n_channels].
+    method: Image resize method.
+    perform_crop: Boolean, whether to perform cropping.
+ regions_to_crop: The regions to crop [height, width]. Will only apply + cropping at the bottom right. + output_shape: Desired shape after resizing [height, width]. + + Returns: + Outputs after cropping (if perform_crop = True) and resizing. + """ + if perform_crop: + image_out = image_in[ + :, :regions_to_crop[0], :regions_to_crop[1], :] + else: + image_out = image_in + return resize_align_corners(image_out, output_shape, method=method) + + +def undo_preprocessing(input_or_prediction_dict: MutableMapping[str, Any], + regions_to_crop: List[int], + output_shape: List[int]) -> MutableMapping[str, Any]: + """Undoes preprocessing for predictions. + + Args: + input_or_prediction_dict: A dictionary storing different types of inputs or + predictions. + regions_to_crop: The regions to crop [height, width]. Will only apply + cropping at the bottom right. + output_shape: Desired shape after resizing [height, width]. + + Returns: + inputs or predictions after cropping (if perform_crop = True) and resizing. + """ + for key in input_or_prediction_dict.keys(): + if key in _PREDICTION_WITH_NEAREST_UPSAMPLING or key in _INPUT_WITH_NEAREST_UPSAMPLING: + input_or_prediction_dict[key] = tf.squeeze( + undo_image_preprocessing( + tf.expand_dims(input_or_prediction_dict[key], 3), + 'nearest', + perform_crop=True, + regions_to_crop=regions_to_crop, + output_shape=output_shape), + axis=3) + elif key in _PREDICTION_WITH_BILINEAR_UPSAMPLING or key in _INPUT_WITH_BILINEAR_UPSAMPLING: + input_or_prediction_dict[key] = undo_image_preprocessing( + input_or_prediction_dict[key], + 'bilinear', + perform_crop=True, + regions_to_crop=regions_to_crop, + output_shape=output_shape) + else: + # We only undo preprocessing for those defined in + # _{PREDICTION,INPUT}_WITH_{NEAREST,BILINEAR}_UPSAMPLING. + # Other intermediate results are skipped. + continue + return input_or_prediction_dict + + +def add_zero_padding(input_tensor: tf.Tensor, kernel_size: int, + rank: int) -> tf.Tensor: + """Adds zero-padding to the input_tensor.""" + pad_total = kernel_size - 1 + pad_begin = pad_total // 2 + pad_end = pad_total - pad_begin + if rank == 3: + return tf.pad( + input_tensor, + paddings=[[pad_begin, pad_end], [pad_begin, pad_end], [0, 0]]) + else: + return tf.pad( + input_tensor, + paddings=[[0, 0], [pad_begin, pad_end], [pad_begin, pad_end], [0, 0]]) + + +def resize_and_rescale_offsets(input_tensor: tf.Tensor, target_size): + """Bilinearly resizes and rescales the offsets. + + Args: + input_tensor: A tf.Tensor of shape [batch, height, width, 2]. + target_size: A list or tuple or 1D tf.Tensor that specifies the height and + width after resizing. + + Returns: + The input_tensor resized to shape `[batch, target_height, target_width, 2]`. + Moreover, the offsets along the y-axis are rescaled by a factor equal to + (target_height - 1) / (reference_height - 1) and the offsets along the + x-axis are rescaled by a factor equal to + (target_width - 1) / (reference_width - 1). 
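+
+    For example, resizing a [batch, 33, 33, 2] offset map to target_size
+    [65, 65] rescales both offset channels by (65 - 1) / (33 - 1) = 2.0.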
+ """ + input_size_y = tf.shape(input_tensor)[1] + input_size_x = tf.shape(input_tensor)[2] + + scale_y = tf.cast(target_size[0] - 1, tf.float32) / tf.cast( + input_size_y - 1, tf.float32) + scale_x = tf.cast(target_size[1] - 1, tf.float32) / tf.cast( + input_size_x - 1, tf.float32) + + target_y, target_x = tf.split( + value=input_tensor, num_or_size_splits=2, axis=3) + target_y *= scale_y + target_x *= scale_x + target = tf.concat([target_y, target_x], 3) + return resize_bilinear(target, target_size) + + +def resize_align_corners(input_tensor, target_size, method='bilinear'): + """Resizes the input_tensor to target_size. + + This returns the same output as tf.compat.v1.image.resize(input_tensor, + target_size, align_corners=True). + + Args: + input_tensor: A tf.Tensor of shape [batch, height, width, channels]. + target_size: A list or tuple or 1D tf.Tensor that specifies the height and + width after resizing. + method: An optional string specifying the method used for resizing. + Supported options are 'nearest' and 'bilinear'. + + Returns: + The resized tensor. + + Raises: + ValueError: An error occurs if 1) the input tensor's rank is not 4 or 2) the + resizing method is not supported. + """ + if method == 'bilinear': + tf_method = tf.compat.v1.image.ResizeMethod.BILINEAR + elif method == 'nearest': + tf_method = tf.compat.v1.image.ResizeMethod.NEAREST_NEIGHBOR + else: + raise ValueError('The given method %s is not supported. Please use bilinear' + ' or nearest.' % method) + + tf.debugging.assert_rank( + input_tensor, 4, + message='Input tensor to resize method should have rank of 4.') + + return tf.compat.v1.image.resize( + input_tensor, + target_size, + method=tf_method, + align_corners=True, + name='resize_align_corners') + + +def resize_bilinear(images, + size, + align_corners=True, + name=None): + """TPU memory efficient version of tf.compat.v1.image.resize_bilinear. + + ResizeBilinear on TPU requires padded batch and channel dimensions. On a + TPUv3, the worst case could lead to 256x memory consumption, if the + input is, for example, [1, 257, 513, 1]. In this function, we replace the + default resize_bilinear by two resize_bilinear operations, which put one image + axis on the channel axis. This reduces TPU padding when batch * channel is + small and height * width is large. + + Args: + images: Input image of shape [B, H, W, C]. + size: A list of two elements: [height, width]. The new size for the images. + align_corners: Whether to align corners of the image. + name: Name of the operation. + + Returns: + Resized image. + """ + _, height, width, channel = images.get_shape().as_list() + if height == size[0] and width == size[1]: + return images + dtype = images.dtype + images = tf.cast(images, tf.float32) + # We check the channel axis only since the batch size is similar (usually 1 or + # 2). In this way, this if-else easily supports dynamic batch size without + # using tf.cond(). 
+  if channel > 32 or not align_corners:
+    images = tf.compat.v1.image.resize_bilinear(
+        images, size,
+        align_corners=align_corners,
+        name=name)
+  else:
+    images = tf.transpose(images, [0, 3, 1, 2])
+    images = tf.compat.v1.image.resize_bilinear(
+        images, [channel, size[0]],
+        align_corners=align_corners,
+        name=name + '_height' if name else None)
+    images = tf.transpose(images, [0, 1, 3, 2])
+    images = tf.compat.v1.image.resize_bilinear(
+        images, [channel, size[1]],
+        align_corners=align_corners,
+        name=name + '_width' if name else None)
+    images = tf.transpose(images, [0, 3, 2, 1])
+  return tf.cast(images, dtype)
+
+
+def make_divisible(value: float,
+                   divisor: int,
+                   min_value: Optional[float] = None) -> int:
+  """Ensures all layers have channels that are divisible by the divisor.
+
+  Args:
+    value: A `float` of original value.
+    divisor: An `int` of the divisor that needs to be checked upon.
+    min_value: A `float` of minimum value threshold.
+
+  Returns:
+    The adjusted value in `int` that is divisible by divisor.
+
+  Raises:
+    ValueError: Minimum value should be divisible by divisor.
+  """
+  if min_value is None:
+    min_value = divisor
+  elif min_value % divisor != 0:
+    raise ValueError('Minimum value should be divisible by divisor.')
+
+  new_value = max(min_value, int(value + divisor / 2) // divisor * divisor)
+  # Make sure that round down does not go down by more than 10%.
+  if new_value < 0.9 * value:
+    new_value += divisor
+  return int(new_value)
+
+
+def transpose_and_reshape_for_attention_operation(inputs):
+  """Sequentially transposes and reshapes the tensor.
+
+  Args:
+    inputs: An input [batch, num_heads, length, channel] tensor.
+
+  Returns:
+    output: An output [batch, length, num_heads * channel] tensor.
+  """
+  _, num_heads, length, channel = inputs.get_shape().as_list()
+  transposed_inputs = tf.transpose(inputs, [0, 2, 1, 3])
+  return tf.reshape(transposed_inputs, [-1, length, num_heads * channel])
+
+
+def reshape_and_transpose_for_attention_operation(inputs, num_heads):
+  """Sequentially reshapes and transposes the tensor.
+
+  Args:
+    inputs: An input [batch, length, num_heads * channel] tensor.
+    num_heads: An integer, the number of attention heads.
+
+  Returns:
+    output: An output [batch, num_heads, length, channel] tensor.
+  """
+  _, length, channels = inputs.get_shape().as_list()
+  inputs = tf.reshape(inputs, [-1, length, num_heads, channels // num_heads])
+  return tf.transpose(inputs, [0, 2, 1, 3])
+
+
+def get_layer_name(private_attribute_name):
+  if private_attribute_name[0] != '_':
+    raise ValueError('Private attribute name should start with a \'_\'.')
+  return private_attribute_name[1:]
+
+
+def get_stem_current_name(index):
+  return '_basic_block{}'.format(index + 1)
+
+
+def get_low_level_conv_fusion_conv_current_names(index):
+  return ('_low_level_conv{}'.format(index + 1),
+          '_fusion_conv{}'.format(index + 1))
+
+
+def get_conv_bn_act_current_name(index, use_bn, activation):
+  name = '_conv{}'.format(index + 1)
+  if use_bn:
+    name += '_bn'
+  if (activation is not None and
+      activation.lower() != 'none' and
+      activation.lower() != 'linear'):
+    name += '_act'
+  return name
+
+
+def safe_setattr(obj, name, value):
+  """A conflict-safe version of setattr().
+
+  Different from setattr(), this function raises ValueError if the object
+  already has an attribute with the same name.
+
+  Args:
+    obj: An object whose attribute has to be set.
+    name: A string, the name of the attribute.
+    value: Any type, the value given to the attribute.
+
+  Raises:
+    ValueError: If the object already has an attribute with the same name.
+  """
+  if hasattr(obj, name):
+    raise ValueError('The object already has an attribute with the same name.')
+  setattr(obj, name, value)
+
+
+def pad_sequence_with_none(sequence, target_length):
+  return list(sequence) + [None] * (target_length - len(sequence))
+
+
+def strided_downsample(input_tensor, target_size):
+  """Strided downsamples a tensor to the target size.
+
+  The stride_height and stride_width are computed by (height - 1) //
+  (target_height - 1) and (width - 1) // (target_width - 1). We raise an error
+  if stride_height != stride_width, since this is not intended in our current
+  use cases. But this check can be removed if different strides are desired.
+  This function supports static shape only.
+
+  Args:
+    input_tensor: A [batch, height, width] tf.Tensor to be downsampled.
+    target_size: A list of two integers, [target_height, target_width], the
+      target size after downsampling.
+
+  Returns:
+    output_tensor: A [batch, target_height, target_width] tf.Tensor, the
+      downsampled result.
+
+  Raises:
+    ValueError: If the input cannot be downsampled with integer stride, i.e.,
+      (height - 1) % (target_height - 1) != 0, or (width - 1) % (target_width -
+      1) != 0.
+    ValueError: If the height axis stride does not equal the width axis
+      stride.
+  """
+  input_height, input_width = input_tensor.get_shape().as_list()[1:3]
+  target_height, target_width = target_size
+
+  if ((input_height - 1) % (target_height - 1) or
+      (input_width - 1) % (target_width - 1)):
+    raise ValueError('The input cannot be downsampled with integer striding. '
+                     'Please ensure (height - 1) % (target_height - 1) == 0 '
+                     'and (width - 1) % (target_width - 1) == 0.')
+  stride_height = (input_height - 1) // (target_height - 1)
+  stride_width = (input_width - 1) // (target_width - 1)
+  if stride_height != stride_width:
+    raise ValueError('The height axis stride does not equal the width axis '
+                     'stride.')
+  if stride_height > 1 or stride_width > 1:
+    return input_tensor[:, ::stride_height, ::stride_width]
+  return input_tensor
+
+
+def get_stuff_class_ids(num_thing_stuff_classes: int,
+                        thing_class_ids: List[int],
+                        void_label: int) -> List[int]:
+  """Computes stuff_class_ids.
+
+  The stuff_class_ids are computed from the num_thing_stuff_classes, the
+  thing_class_ids and the void_label.
+
+  Args:
+    num_thing_stuff_classes: An integer specifying the number of stuff and
+      thing classes, not including `void` class.
+    thing_class_ids: A List of integers of length [num_thing_classes]
+      containing thing class indices.
+    void_label: An integer specifying the void label.
+
+  Returns:
+    stuff_class_ids: A sorted List of integers of length [num_stuff_classes]
+      containing stuff class indices.
+  """
+  if void_label >= num_thing_stuff_classes:
+    thing_stuff_class_ids = list(range(num_thing_stuff_classes))
+  else:
+    # Use != (rather than `is not`) for value comparison of integer labels.
+    thing_stuff_class_ids = [_ for _ in range(num_thing_stuff_classes + 1)
+                             if _ != void_label]
+  return sorted(set(thing_stuff_class_ids) - set(thing_class_ids))
+
+
+def get_supported_tasks(
+    config: config_pb2.ExperimentOptions) -> Set[str]:
+  """Gets currently supported tasks for each meta_architecture.
+
+  Args:
+    config: A config_pb2.ExperimentOptions configuration.
+ + Returns: + supported_tasks: A set of strings (see common.py), optionally + - common.TASK_PANOPTIC_SEGMENTATION, + - common.TASK_INSTANCE_SEGMENTATION, + - common.TASK_VIDEO_PANOPTIC_SEGMENTATION, + """ + supported_tasks = set() + meta_architecture = config.model_options.WhichOneof('meta_architecture') + is_max_deeplab = meta_architecture == 'max_deeplab' + is_motion_deeplab = meta_architecture == 'motion_deeplab' + is_panoptic_deeplab = meta_architecture == 'panoptic_deeplab' + is_vip_deeplab = meta_architecture == 'vip_deeplab' + is_panoptic = ( + (config.model_options.panoptic_deeplab.instance.enable and + is_panoptic_deeplab) or + is_motion_deeplab or is_max_deeplab or is_vip_deeplab) + if is_panoptic: + supported_tasks.add(common.TASK_PANOPTIC_SEGMENTATION) + # MaX-DeepLab does not support evaluating instance segmentation mask AP yet. + if not is_max_deeplab: + supported_tasks.add(common.TASK_INSTANCE_SEGMENTATION) + if is_motion_deeplab or is_vip_deeplab: + supported_tasks.add(common.TASK_VIDEO_PANOPTIC_SEGMENTATION) + if is_vip_deeplab: + supported_tasks.add(common.TASK_DEPTH_AWARE_VIDEO_PANOPTIC_SEGMENTATION) + return supported_tasks diff --git a/model/utils_test.py b/model/utils_test.py new file mode 100644 index 0000000000000000000000000000000000000000..1f3848148a8d5eb447c15ae45b5d883d240b6a8f --- /dev/null +++ b/model/utils_test.py @@ -0,0 +1,201 @@ +# coding=utf-8 +# Copyright 2021 The Deeplab2 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for utils.""" + +import itertools + +import numpy as np +import tensorflow as tf + +from deeplab2.model import utils + + +class UtilsTest(tf.test.TestCase): + + def test_resize_logits_graph_mode(self): + @tf.function + def graph_mode_wrapper(*args): + return utils.resize_and_rescale_offsets(*args) + + resized_logits = graph_mode_wrapper(tf.ones((2, 33, 33, 2)), [65, 65]) + resized_logits_2 = graph_mode_wrapper(tf.ones((2, 33, 33, 2)), [33, 33]) + self.assertListEqual(resized_logits.shape.as_list(), [2, 65, 65, 2]) + self.assertListEqual(resized_logits_2.shape.as_list(), [2, 33, 33, 2]) + + def test_resize_logits(self): + offset_logits = tf.convert_to_tensor([[[[2, 2], [2, 1], [2, 0]], + [[1, 2], [1, 1], [1, 0]], + [[0, 2], [0, 1], [0, 0]]]], + dtype=tf.float32) + target_size = [5, 5] + resized_logits = utils.resize_and_rescale_offsets(offset_logits, + target_size) + + self.assertListEqual(resized_logits.shape.as_list(), [1, 5, 5, 2]) + for i in range(5): + for j in range(5): + np.testing.assert_array_almost_equal(resized_logits.numpy()[0, i, j, :], + [4 - i, 4 - j]) + + def test_zero_padding(self): + input_tensor = tf.ones(shape=(2, 5, 5, 2)) + input_tensor_2 = tf.ones(shape=(5, 5, 2)) + padded_tensor = utils.add_zero_padding(input_tensor, kernel_size=5, rank=4) + padded_tensor_2 = utils.add_zero_padding( + input_tensor_2, kernel_size=5, rank=3) + + self.assertEqual(tf.reduce_sum(padded_tensor), 100) + self.assertEqual(tf.reduce_sum(padded_tensor_2), 50) + self.assertListEqual(padded_tensor.shape.as_list(), [2, 9, 9, 2]) + self.assertListEqual(padded_tensor_2.shape.as_list(), [9, 9, 2]) + # Count zero elements. + self.assertEqual(tf.reduce_sum(padded_tensor-1), -224) + self.assertEqual(tf.reduce_sum(padded_tensor_2-1), -112) + + def test_resize_function_error(self): + input_tensor = tf.random.uniform(shape=(2, 10, 10, 2)) + with self.assertRaises(ValueError): + _ = utils.resize_align_corners(input_tensor, [19, 19], + method='not_a_valid_method') + + def test_resize_function_shape(self): + input_tensor = tf.random.uniform(shape=(2, 10, 10, 2)) + result_tensor = utils.resize_align_corners(input_tensor, [19, 19]) + + self.assertListEqual(result_tensor.shape.as_list(), [2, 19, 19, 2]) + + def test_resize_graph_mode(self): + @tf.function + def graph_mode_wrapper(*args): + return utils.resize_align_corners(*args) + + result_tensor = graph_mode_wrapper(tf.ones((2, 33, 33, 2)), [65, 65]) + result_tensor_2 = graph_mode_wrapper(tf.ones((2, 33, 33, 2)), [33, 33]) + self.assertListEqual(result_tensor.shape.as_list(), [2, 65, 65, 2]) + self.assertListEqual(result_tensor_2.shape.as_list(), [2, 33, 33, 2]) + + def test_resize_function_constant_input(self): + input_tensor = tf.ones(shape=(2, 10, 10, 2)) + result_tensor = utils.resize_align_corners(input_tensor, [19, 19]) + + self.assertTrue(tf.keras.backend.all(result_tensor == 1)) + + def test_resize_function_invalid_rank(self): + input_tensor = tf.keras.Input(shape=(None, 2)) + with self.assertRaisesRegex( + ValueError, 'should have rank of 4'): + _ = utils.resize_align_corners(input_tensor, [19, 19]) + + def test_resize_function_v1_compatibility(self): + # Test for odd and even input, and output shapes. 
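+    # Each (shape, target_size, method) combination is checked against
+    # tf.compat.v1.image.resize with align_corners=True as the reference.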
+ input_shapes = [(2, 10, 10, 3), (2, 11, 11, 3)] + target_sizes = [[19, 19], [20, 20]] + methods = ['bilinear', 'nearest'] + + for shape, target_size, method in itertools.product(input_shapes, + target_sizes, methods): + input_tensor = tf.random.uniform(shape=shape) + + result_tensor = utils.resize_align_corners(input_tensor, target_size, + method) + if method == 'bilinear': + expected_tensor = tf.compat.v1.image.resize( + input_tensor, + target_size, + align_corners=True, + method=tf.compat.v1.image.ResizeMethod.BILINEAR) + else: + expected_tensor = tf.compat.v1.image.resize( + input_tensor, + target_size, + align_corners=True, + method=tf.compat.v1.image.ResizeMethod.NEAREST_NEIGHBOR) + + np.testing.assert_equal(result_tensor.numpy(), expected_tensor.numpy()) + + def test_resize_bilinear_v1_compatibility(self): + # Test for odd and even input, and output shapes. + input_shapes = [(2, 10, 10, 3), (2, 11, 11, 3), (1, 11, 11, 64)] + target_sizes = [[19, 19], [20, 20], [10, 10]] + + for shape, target_size in itertools.product(input_shapes, target_sizes): + input_tensor = tf.random.uniform(shape=shape) + result_tensor = utils.resize_bilinear(input_tensor, target_size) + expected_tensor = tf.compat.v1.image.resize( + input_tensor, + target_size, + align_corners=True, + method=tf.compat.v1.image.ResizeMethod.BILINEAR) + self.assertAllClose(result_tensor, expected_tensor) + + def test_make_divisible(self): + value, divisor, min_value = 17, 2, 8 + new_value = utils.make_divisible(value, divisor, min_value) + self.assertAllEqual(new_value, 18) + + value, divisor, min_value = 17, 2, 22 + new_value = utils.make_divisible(value, divisor, min_value) + self.assertAllEqual(new_value, 22) + + def test_transpose_and_reshape_for_attention_operation(self): + images = tf.zeros([2, 8, 11, 2]) + output = utils.transpose_and_reshape_for_attention_operation(images) + self.assertEqual(output.get_shape().as_list(), [2, 11, 16]) + + def test_reshape_and_transpose_for_attention_operation(self): + images = tf.zeros([2, 11, 16]) + output = utils.reshape_and_transpose_for_attention_operation(images, + num_heads=8) + self.assertEqual(output.get_shape().as_list(), [2, 8, 11, 2]) + + def test_safe_setattr_raise_error(self): + layer = tf.keras.layers.Conv2D(1, 1) + with self.assertRaises(ValueError): + utils.safe_setattr(layer, 'filters', 3) + + utils.safe_setattr(layer, 'another_conv', tf.keras.layers.Conv2D(1, 1)) + with self.assertRaises(ValueError): + utils.safe_setattr(layer, 'another_conv', tf.keras.layers.Conv2D(1, 1)) + + def test_pad_sequence_with_none(self): + sequence = [1, 2] + output_2 = utils.pad_sequence_with_none(sequence, target_length=2) + self.assertEqual(output_2, [1, 2]) + output_3 = utils.pad_sequence_with_none(sequence, target_length=3) + self.assertEqual(output_3, [1, 2, None]) + + def test_strided_downsample(self): + inputs = tf.zeros([2, 11, 11]) + output = utils.strided_downsample(inputs, target_size=[6, 6]) + self.assertEqual(output.get_shape().as_list(), [2, 6, 6]) + + def test_get_stuff_class_ids(self): + # num_thing_stuff_classes does not include `void` class. 
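+    # With void_label = 5 (>= num_thing_stuff_classes), the candidate classes
+    # are [0..4]; removing thing classes {3, 4} leaves [0, 1, 2]. With
+    # void_label = 0, the candidates are [0..5] minus the void class, i.e.
+    # {1, 2, 3, 4, 5}; removing {3, 4} leaves [1, 2, 5].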
+ num_thing_stuff_classes = 5 + thing_class_ids = [3, 4] + void_label_list = [5, 0] + expected_stuff_class_ids_list = [ + [0, 1, 2], [1, 2, 5] + ] + for void_label, expected_stuff_class_ids in zip( + void_label_list, expected_stuff_class_ids_list): + stuff_class_ids = utils.get_stuff_class_ids( + num_thing_stuff_classes, thing_class_ids, void_label) + np.testing.assert_equal(stuff_class_ids, + expected_stuff_class_ids) + +if __name__ == '__main__': + tf.test.main() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..17b9290276f31b40f51a33fd3c8e5937ac32ddb2 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +matplotlib +numpy +Pillow +tensorflow +gradio diff --git a/tensorflow_ops/kernels/merge_semantic_and_instance_maps_op.cc b/tensorflow_ops/kernels/merge_semantic_and_instance_maps_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a5d1dd333627a8b6206bdfd6fa2ec902086df87a --- /dev/null +++ b/tensorflow_ops/kernels/merge_semantic_and_instance_maps_op.cc @@ -0,0 +1,86 @@ +// Copyright 2021 The Deeplab2 Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include /*third_party*/"tensorflow/core/framework/op.h" +#include /*third_party*/"tensorflow/core/framework/shape_inference.h" + +namespace tensorflow_models { +namespace deeplab { +namespace deeplab2 { + +using tensorflow::shape_inference::DimensionHandle; +using tensorflow::shape_inference::InferenceContext; +using tensorflow::shape_inference::ShapeHandle; + +REGISTER_OP("MergeSemanticAndInstanceMaps") + .Input("semantic_maps: int32") + .Input("instance_maps: int32") + .Input("thing_ids: int32") + .Attr("label_divisor: int = 256") + .Attr("stuff_area_limit: int = 0") + .Attr("void_label: int = 0") + .Output("parsing_maps: int32") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle semantic_maps; + ShapeHandle instance_maps; + ShapeHandle thing_ids; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 3, &semantic_maps)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 3, &instance_maps)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &thing_ids)); + DimensionHandle batch = c->Dim(semantic_maps, 0); + DimensionHandle height = c->Dim(semantic_maps, 1); + DimensionHandle width = c->Dim(semantic_maps, 2); + c->set_output(0, c->MakeShape({batch, height, width})); + return tensorflow::Status::OK(); + }) + .Doc(R"doc( +Generates parsing maps from semantic maps and instance maps. + +Parsing maps, or panoptic segmentation, are merged from the predicted semantic +maps and class-agnostic instance maps. This function merges the maps in the +following way: + +1) If a pixel belongs to `stuff` class (e.g., sky), the function directly uses + the semantic label from the semantic map and uses 0 as the instance label. +2) If a pixel belongs to `thing` class (e.g., person), it uses the instance + label from the instance map and uses the majority of the semantic labels of + the same instance as the final semantic label. 
+3) The function relabels each instance, so that the instance label of each
+   semantic class is in the range of [1, num_instances_of_the_semantic_class].
+
+Note that this operation is first proposed in the DeeperLab paper and adopted
+by the Panoptic-DeepLab framework.
+ - DeeperLab: Single-Shot Image Parser, T-J Yang, et al. arXiv:1902.05093.
+ - Panoptic-DeepLab, B. Cheng, et al. In CVPR, 2020.
+
+semantic_maps: An int32 Tensor with shape `[batch, height, width]` whose value
+  indicates the predicted semantic label of each pixel.
+instance_maps: An int32 Tensor with shape `[batch, height, width]` whose value
+  indicates the predicted instance label of each pixel.
+thing_ids: An int32 Tensor with shape `[num_thing_ids]` whose value refers to
+  the semantic ids of the thing classes.
+label_divisor: An integer. The value used to combine the semantic and instance
+  map to generate the parsing map. In particular, the value of a pixel in the
+  parsing map is equal to its corresponding semantic label times label_divisor
+  plus instance label (i.e., semantic_label * label_divisor + instance_label).
+stuff_area_limit: An integer. Predicted stuff segments whose areas are smaller
+  than this threshold are assigned to the VOID label.
+void_label: An integer, specifying the VOID label.
+parsing_maps: An int32 Tensor with shape `[batch, height, width]` whose value
+  indicates the merged semantic and instance label of each pixel.
+)doc");
+
+}  // namespace deeplab2
+}  // namespace deeplab
+}  // namespace tensorflow_models
diff --git a/tensorflow_ops/kernels/merge_semantic_and_instance_maps_op_kernel.cc b/tensorflow_ops/kernels/merge_semantic_and_instance_maps_op_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2a5071bb21e0b06a472be9efaba2f7438e6e9f35
--- /dev/null
+++ b/tensorflow_ops/kernels/merge_semantic_and_instance_maps_op_kernel.cc
@@ -0,0 +1,279 @@
+// Copyright 2021 The Deeplab2 Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include