# coding=utf-8
# Copyright 2021 The Deeplab2 Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains Axial-ResNet model instances for Axial-DeepLab and MaX-DeepLab. | |
Reference: | |
Axial-Deeplab: Stand-Alone Axial-Attention for Panoptic Segmentation, | |
ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853 | |
Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille, | |
Liang-Chieh Chen. | |
MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers", | |
CVPR 2021. https://arxiv.org/abs/2012.00759 | |
Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen. | |
""" | |
import abc | |
import collections.abc | |
import copy | |
from absl import logging | |
import tensorflow as tf | |
from deeplab2.model.encoder import axial_resnet | |


def _get_default_config():
  """Gets the default config for Axial-ResNets."""
  # The default config dictionary for an Axial-ResNet is the MaX-DeepLab-S
  # architecture for panoptic segmentation. This default config dictionary
  # also exactly matches the default arguments of the functions.
  default_config = {
      'num_blocks': [3, 4, 6, 3],
      'backbone_layer_multiplier': 1.0,
      'width_multiplier': 1.0,
      'stem_width_multiplier': 1.0,
      'output_stride': 16,
      'classification_mode': False,
      'backbone_type': 'resnet_beta',
      'use_axial_beyond_stride': 16,
      'backbone_use_transformer_beyond_stride': 32,
      'extra_decoder_use_transformer_beyond_stride': 32,
      'backbone_decoder_num_stacks': 0,
      'backbone_decoder_blocks_per_stage': 1,
      'extra_decoder_num_stacks': 0,
      'extra_decoder_blocks_per_stage': 1,
      'max_num_mask_slots': 128,
      'num_mask_slots': 128,
      'memory_channels': 256,
      'base_transformer_expansion': 1.0,
      'global_feed_forward_network_channels': 256,
      'high_resolution_output_stride': 4,
      'activation': 'relu',
      'block_group_config': {
          'attention_bottleneck_expansion': 2,
          'drop_path_keep_prob': 0.8,
          'drop_path_beyond_stride': 16,
          'drop_path_schedule': 'constant',
          'positional_encoding_type': None,
          'use_global_beyond_stride': 0,
          'use_sac_beyond_stride': 0,
          'use_squeeze_and_excite': False,
          'conv_use_recompute_grad': False,
          'axial_use_recompute_grad': True,
          'recompute_within_stride': 0,
          'transformer_use_recompute_grad': False,
          'axial_layer_config': {
              'query_shape': (129, 129),
              'key_expansion': 1,
              'value_expansion': 2,
              'memory_flange': (32, 32),
              'double_global_attention': False,
              'num_heads': 8,
              'use_query_rpe_similarity': True,
              'use_key_rpe_similarity': True,
              'use_content_similarity': True,
              'retrieve_value_rpe': True,
              'retrieve_value_content': True,
              'initialization_std_for_query_key_rpe': 1.0,
              'initialization_std_for_value_rpe': 1.0,
              'self_attention_activation': 'softmax',
          },
          'dual_path_transformer_layer_config': {
              'num_heads': 8,
              'bottleneck_expansion': 2,
              'key_expansion': 1,
              'value_expansion': 2,
              'feed_forward_network_channels': 2048,
              'use_memory_self_attention': True,
              'use_pixel2memory_feedback_attention': True,
              'transformer_activation': 'softmax',
          },
      },
      'bn_layer': tf.keras.layers.BatchNormalization,
      'conv_kernel_weight_decay': 0.0,
  }
  return default_config
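
# Example (an illustrative sketch): since this dictionary exactly matches the
# default keyword arguments, MaXDeepLabS('max_deeplab_s') is built from it
# verbatim, while MaXDeepLabS('max_deeplab_s', output_stride=32) keeps every
# default above except 'output_stride'.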


def override(config_dict, override_dict):
  """Recursively overrides a config dict with another."""
  output_dict = copy.deepcopy(config_dict)
  for key, value in override_dict.items():
    if isinstance(value, collections.abc.Mapping):
      output_dict[key] = override(config_dict.get(key, {}), value)
    else:
      output_dict[key] = value
  return output_dict
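
# Example of the recursive merge (a minimal sketch with made-up keys):
#   override({'a': 1, 'b': {'c': 2}}, {'b': {'d': 3}})
#   == {'a': 1, 'b': {'c': 2, 'd': 3}}
# The base config is deep-copied first, so neither input is modified.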


class AxialResNetInstance(axial_resnet.AxialResNet):
  """A base Axial-ResNet model."""

  @classmethod
  @abc.abstractmethod
  def _get_config(cls):
    pass

  def __init__(self, name, **kwargs):
    """Builds an Axial-ResNet model."""
    # Get the config of the current model.
    current_config = self._get_config()
    # Override the default config with the current config. This line can be
    # omitted because the default config equals the default arguments of the
    # functions. But we make all the configs explicit here.
    current_config = override(_get_default_config(), current_config)
    # Finally, override the current model config with keyword arguments. In
    # this way, we still respect arguments passed as keyword arguments, such
    # as classification_mode, output_stride, etc.
    current_config = override(current_config, kwargs)
    logging.info('Axial-ResNet final config: %s', current_config)
    super(AxialResNetInstance, self).__init__(name, **current_config)
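
# A minimal sketch of how a new model instance is defined (the class name and
# the overridden field below are hypothetical; the real instances follow):
#
#   class MyAxialResNet(AxialResNetInstance):
#
#     @classmethod
#     def _get_config(cls):
#       # List only the fields that differ from _get_default_config().
#       return {'output_stride': 32}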


class MaXDeepLabS(AxialResNetInstance):
  """MaX-DeepLab-S for panoptic segmentation.

  Reference:
    Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic Segmentation,
      ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853
        Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille,
        Liang-Chieh Chen.
    MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers",
      CVPR 2021. https://arxiv.org/abs/2012.00759
        Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen.
  """

  @classmethod
  def _get_config(cls):
    # Return an empty dictionary as the default values are all set for
    # MaX-DeepLab-S.
    return {}


class MaXDeepLabL(AxialResNetInstance):
  """MaX-DeepLab-L for panoptic segmentation.

  Reference:
    Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic Segmentation,
      ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853
        Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille,
        Liang-Chieh Chen.
    MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers",
      CVPR 2021. https://arxiv.org/abs/2012.00759
        Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen.
  """

  @classmethod
  def _get_config(cls):
    return {
        'num_blocks': [3, 6, 3, 3],
        'backbone_type': 'wider_resnet',
        'backbone_use_transformer_beyond_stride': 16,
        'extra_decoder_use_transformer_beyond_stride': 16,
        'backbone_decoder_num_stacks': 1,
        'extra_decoder_num_stacks': 1,
        'extra_decoder_blocks_per_stage': 3,
        'memory_channels': 512,
        'base_transformer_expansion': 2.0,
        'global_feed_forward_network_channels': 512,
        'block_group_config': {
            'attention_bottleneck_expansion': 4,
            'drop_path_beyond_stride': 4,
            'axial_layer_config': {
                'key_expansion': 2,
                'value_expansion': 4,
            },
        },
    }


class MaXDeepLabSBackbone(MaXDeepLabS):
  """MaX-DeepLab-S backbone for image classification pretraining.

  Reference:
    Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic Segmentation,
      ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853
        Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille,
        Liang-Chieh Chen.
    MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers",
      CVPR 2021. https://arxiv.org/abs/2012.00759
        Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen.
  """

  @classmethod
  def _get_config(cls):
    base_config = super(MaXDeepLabSBackbone, cls)._get_config()
    # Override the config of MaXDeepLabS.
    override_config = {
        'classification_mode': True,
        # The transformer blocks are not ImageNet pretrained. They are randomly
        # initialized and trained from scratch for panoptic segmentation.
        'backbone_use_transformer_beyond_stride': 0,
    }
    return override(base_config, override_config)


class MaXDeepLabLBackbone(MaXDeepLabL):
  """MaX-DeepLab-L backbone for image classification pretraining.

  Reference:
    Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic Segmentation,
      ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853
        Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille,
        Liang-Chieh Chen.
    MaX-DeepLab: "End-to-End Panoptic Segmentation with Mask Transformers",
      CVPR 2021. https://arxiv.org/abs/2012.00759
        Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen.
  """

  @classmethod
  def _get_config(cls):
    base_config = super(MaXDeepLabLBackbone, cls)._get_config()
    # Override the config of MaXDeepLabL.
    override_config = {
        'classification_mode': True,
        # The transformer blocks are not ImageNet pretrained. They are randomly
        # initialized and trained from scratch for panoptic segmentation.
        'backbone_use_transformer_beyond_stride': 0,
    }
    return override(base_config, override_config)


class ResNet50(AxialResNetInstance):
  """A ResNet-50 instance.

  Note that the implementation is different from the original ResNet-50 in:
  (1) We apply strided convolutions in the first 3x3 convolution of the first
      residual block of a stage.
  (2) We replace the strided max pooling layer in the stem by applying a
      strided convolution in the immediately following residual block.
  """

  @classmethod
  def _get_config(cls):
    return {
        'classification_mode': True,
        'backbone_type': 'resnet',
        'use_axial_beyond_stride': 0,
        'backbone_use_transformer_beyond_stride': 0,
        'block_group_config': {
            'drop_path_keep_prob': 1.0,
        },
    }


class ResNet50Beta(ResNet50):
  """A ResNet-50 with an inception stem.

  Note that the implementation is different from the original ResNet-50 in:
  (1) We apply strided convolutions in the first 3x3 convolution of the first
      residual block of a stage.
  (2) We replace the strided max pooling layer in the stem by applying a
      strided convolution in the immediately following residual block.
  """

  @classmethod
  def _get_config(cls):
    base_config = super(ResNet50Beta, cls)._get_config()
    # Override the config of ResNet50.
    override_config = {
        'backbone_type': 'resnet_beta',
    }
    return override(base_config, override_config)


class AxialResNetL(ResNet50):
  """Axial-ResNet-L for image classification only.

  Axial-ResNet-L is a ResNet50 with use_axial_beyond_stride = 2.

  Reference:
    Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic Segmentation,
      ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853
        Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille,
        Liang-Chieh Chen.
  """

  @classmethod
  def _get_config(cls):
    base_config = super(AxialResNetL, cls)._get_config()
    # Override the config of ResNet50.
    override_config = {
        'use_axial_beyond_stride': 2,
    }
    return override(base_config, override_config)


class AxialResNetS(ResNet50):
  """Axial-ResNet-S for image classification only.

  Axial-ResNet-S is a ResNet50 with use_axial_beyond_stride = 2 and
  width_multiplier = 0.5.

  Reference:
    Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic Segmentation,
      ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853
        Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille,
        Liang-Chieh Chen.
  """

  @classmethod
  def _get_config(cls):
    base_config = super(AxialResNetS, cls)._get_config()
    # Override the config of ResNet50.
    override_config = {
        'width_multiplier': 0.5,
        'use_axial_beyond_stride': 2,
    }
    return override(base_config, override_config)


class AxialDeepLabL(ResNet50Beta):
  """Axial-DeepLab-L for panoptic segmentation.

  Axial-DeepLab-L is a ResNet50Beta with use_axial_beyond_stride = 2.
  Axial-DeepLab-L is also equivalent to Axial-ResNet-L with an inception stem.

  Reference:
    Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic Segmentation,
      ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853
        Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille,
        Liang-Chieh Chen.
  """

  @classmethod
  def _get_config(cls):
    base_config = super(AxialDeepLabL, cls)._get_config()
    override_config = {
        'use_axial_beyond_stride': 2,
    }
    return override(base_config, override_config)


class AxialDeepLabS(ResNet50Beta):
  """Axial-DeepLab-S for panoptic segmentation.

  Axial-DeepLab-S is a ResNet50Beta with use_axial_beyond_stride = 2 and
  width_multiplier = 0.5.
  Axial-DeepLab-S is also equivalent to Axial-ResNet-S with an inception stem.

  Reference:
    Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic Segmentation,
      ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853
        Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille,
        Liang-Chieh Chen.
  """

  @classmethod
  def _get_config(cls):
    base_config = super(AxialDeepLabS, cls)._get_config()
    override_config = {
        'width_multiplier': 0.5,
        'use_axial_beyond_stride': 2,
    }
    return override(base_config, override_config)


class SWideRNet(AxialResNetInstance):
  """A SWideRNet instance.

  Note that the implementation is different from the original SWideRNet in:
  (1) We apply strided convolutions in the first residual block of a stage,
      instead of the last residual block.
  (2) We replace the strided max pooling layer in the stem by applying a
      strided convolution in the immediately following residual block.
  (3) We (optionally) use squeeze and excitation in all five stages, instead
      of the last four stages only.

  Reference:
    Scaling Wide Residual Networks for Panoptic Segmentation,
      https://arxiv.org/abs/2011.11675
        Liang-Chieh Chen, Huiyu Wang, Siyuan Qiao.
    Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic Segmentation,
      ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853
        Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille,
        Liang-Chieh Chen.
  """

  @classmethod
  def _get_config(cls):
    return {
        'num_blocks': [3, 6, 3, 3],
        'classification_mode': True,
        'backbone_type': 'wider_resnet',
        'use_axial_beyond_stride': 0,
        'backbone_use_transformer_beyond_stride': 0,
        'block_group_config': {
            'drop_path_beyond_stride': 4,
            'conv_use_recompute_grad': True,
        },
    }


class AxialSWideRNet(SWideRNet):
  """SWideRNet with axial attention blocks in the last two stages.

  Note that the implementation is different from the original SWideRNet in:
  (1) We apply strided convolutions in the first residual block of a stage,
      instead of the last residual block.
  (2) We replace the strided max pooling layer in the stem by applying a
      strided convolution in the immediately following residual block.
  (3) We (optionally) use squeeze and excitation in all five stages, instead
      of the last four stages only.

  Reference:
    Scaling Wide Residual Networks for Panoptic Segmentation,
      https://arxiv.org/abs/2011.11675
        Liang-Chieh Chen, Huiyu Wang, Siyuan Qiao.
    Axial-DeepLab: Stand-Alone Axial-Attention for Panoptic Segmentation,
      ECCV 2020 Spotlight. https://arxiv.org/abs/2003.07853
        Huiyu Wang, Yukun Zhu, Bradley Green, Hartwig Adam, Alan Yuille,
        Liang-Chieh Chen.
  """

  @classmethod
  def _get_config(cls):
    base_config = super(AxialSWideRNet, cls)._get_config()
    override_config = {
        'use_axial_beyond_stride': 16,
        'block_group_config': {
            'attention_bottleneck_expansion': 4,
            'axial_layer_config': {
                'key_expansion': 2,
                'value_expansion': 4,
            },
        },
    }
    return override(base_config, override_config)


def get_model(name, **kwargs):
  """Gets the model instance given the model name."""
  name_lower = name.lower()
  if name_lower == 'max_deeplab_s':
    return MaXDeepLabS(name_lower, **kwargs)
  elif name_lower == 'max_deeplab_l':
    return MaXDeepLabL(name_lower, **kwargs)
  elif name_lower == 'max_deeplab_s_backbone':
    return MaXDeepLabSBackbone(name_lower, **kwargs)
  elif name_lower == 'max_deeplab_l_backbone':
    return MaXDeepLabLBackbone(name_lower, **kwargs)
  elif name_lower == 'resnet50':
    return ResNet50(name_lower, **kwargs)
  elif name_lower == 'resnet50_beta':
    return ResNet50Beta(name_lower, **kwargs)
  elif name_lower in ('swidernet', 'wide_resnet41'):
    return SWideRNet(name_lower, **kwargs)
  elif name_lower == 'axial_swidernet':
    return AxialSWideRNet(name_lower, **kwargs)
  elif name_lower == 'axial_resnet_s':
    return AxialResNetS(name_lower, **kwargs)
  elif name_lower == 'axial_resnet_l':
    return AxialResNetL(name_lower, **kwargs)
  elif name_lower == 'axial_deeplab_s':
    return AxialDeepLabS(name_lower, **kwargs)
  elif name_lower == 'axial_deeplab_l':
    return AxialDeepLabL(name_lower, **kwargs)
  else:
    raise ValueError(name_lower + ' is not supported.')
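
# Example usage (an illustrative sketch; the keyword overrides shown are
# optional and simply override keys of _get_default_config()):
#   classification_backbone = get_model('resnet50', classification_mode=True)
#   panoptic_encoder = get_model('max_deeplab_s', output_stride=16)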