# coding=utf-8
# Copyright 2021 The Deeplab2 Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""MobileNetV3 models for Deep Labeling.

Reference:
  Howard, A., Sandler, M., et al. Searching for mobilenetv3. In ICCV, 2019
"""
from typing import Any, Callable, Mapping, Optional, Sequence

import tensorflow as tf

from deeplab2.model import utils
from deeplab2.model.layers import blocks
from deeplab2.model.layers import convolutions

# The default input image channels.
_INPUT_CHANNELS = 3
# Layer-by-layer specification of the MobileNetV3Small backbone. Every row
# in 'block_specs' is ordered according to 'block_spec_schema'.
MNV3Small_BLOCK_SPECS = {
    'spec_name': 'MobileNetV3Small',
    'block_spec_schema': ['block_fn', 'kernel_size', 'strides', 'filters',
                          'activation', 'se_ratio', 'expand_ratio',
                          'is_endpoint'],
    'block_specs': [
        # block_fn, kernel, stride, filters, activation, se, expand, endpoint
        ('conv_bn', 3, 2, 16, 'hard_swish', None, None, True),
        ('inverted_bottleneck', 3, 2, 16, 'relu', 0.25, 1, True),
        ('inverted_bottleneck', 3, 2, 24, 'relu', None, 72. / 16, False),
        ('inverted_bottleneck', 3, 1, 24, 'relu', None, 88. / 24, True),
        ('inverted_bottleneck', 5, 2, 40, 'hard_swish', 0.25, 4., False),
        ('inverted_bottleneck', 5, 1, 40, 'hard_swish', 0.25, 6., False),
        ('inverted_bottleneck', 5, 1, 40, 'hard_swish', 0.25, 6., False),
        ('inverted_bottleneck', 5, 1, 48, 'hard_swish', 0.25, 3., False),
        ('inverted_bottleneck', 5, 1, 48, 'hard_swish', 0.25, 3., True),
        ('inverted_bottleneck', 5, 2, 96, 'hard_swish', 0.25, 6., False),
        ('inverted_bottleneck', 5, 1, 96, 'hard_swish', 0.25, 6., False),
        ('inverted_bottleneck', 5, 1, 96, 'hard_swish', 0.25, 6., False),
        ('conv_bn', 1, 1, 576, 'hard_swish', None, None, True),
    ],
}
# Layer-by-layer specification of the MobileNetV3Large backbone. Every row
# in 'block_specs' is ordered according to 'block_spec_schema'.
MNV3Large_BLOCK_SPECS = {
    'spec_name': 'MobileNetV3Large',
    'block_spec_schema': ['block_fn', 'kernel_size', 'strides', 'filters',
                          'activation', 'se_ratio', 'expand_ratio',
                          'is_endpoint'],
    'block_specs': [
        # block_fn, kernel, stride, filters, activation, se, expand, endpoint
        ('conv_bn', 3, 2, 16, 'hard_swish', None, None, False),
        ('inverted_bottleneck', 3, 1, 16, 'relu', None, 1., True),
        ('inverted_bottleneck', 3, 2, 24, 'relu', None, 4., False),
        ('inverted_bottleneck', 3, 1, 24, 'relu', None, 3., True),
        ('inverted_bottleneck', 5, 2, 40, 'relu', 0.25, 3., False),
        ('inverted_bottleneck', 5, 1, 40, 'relu', 0.25, 3., False),
        ('inverted_bottleneck', 5, 1, 40, 'relu', 0.25, 3., True),
        ('inverted_bottleneck', 3, 2, 80, 'hard_swish', None, 6., False),
        ('inverted_bottleneck', 3, 1, 80, 'hard_swish', None, 2.5, False),
        ('inverted_bottleneck', 3, 1, 80, 'hard_swish', None, 2.3, False),
        ('inverted_bottleneck', 3, 1, 80, 'hard_swish', None, 2.3, False),
        ('inverted_bottleneck', 3, 1, 112, 'hard_swish', 0.25, 6., False),
        ('inverted_bottleneck', 3, 1, 112, 'hard_swish', 0.25, 6., True),
        ('inverted_bottleneck', 5, 2, 160, 'hard_swish', 0.25, 6., False),
        ('inverted_bottleneck', 5, 1, 160, 'hard_swish', 0.25, 6., False),
        ('inverted_bottleneck', 5, 1, 160, 'hard_swish', 0.25, 6., False),
        ('conv_bn', 1, 1, 960, 'hard_swish', None, None, True),
    ],
}
# Registry mapping each supported model_id to its block specification.
SUPPORTED_SPECS_MAP = dict(
    MobileNetV3Large=MNV3Large_BLOCK_SPECS,
    MobileNetV3Small=MNV3Small_BLOCK_SPECS,
)
# pylint: disable=invalid-name
def _block_spec_decoder(specs: Mapping[Any, Any],
                        width_multiplier: float,
                        divisible_by: int = 8) -> Sequence[Mapping[str, Any]]:
  """Decodes specs for a block.

  Args:
    specs: A `dict` specification of block specs of a mobilenet version.
    width_multiplier: A `float` multiplier for the filter size for all
      convolution ops. The value must be greater than zero. Typical usage will
      be to set this value in (0, 1) to reduce the number of parameters or
      computation cost of the model.
    divisible_by: An `int` that ensures all inner dimensions are divisible by
      this number.

  Returns:
    A list of block specs in dictionary form that defines the structure of
    the layers, with 'filters' already scaled by `width_multiplier` and
    rounded up to a multiple of `divisible_by`.

  Raises:
    ValueError: If `block_specs` is empty, or if any block spec row does not
      match the schema.
  """
  spec_name = specs['spec_name']
  block_spec_schema = specs['block_spec_schema']
  block_specs = specs['block_specs']

  if not block_specs:
    raise ValueError(
        'The block spec cannot be empty for {} !'.format(spec_name))

  # Validate every row against the schema, not just the first one: zip()
  # below silently truncates, so a malformed later row would otherwise
  # produce a spec dict with missing keys and fail much later.
  for spec in block_specs:
    if len(spec) != len(block_spec_schema):
      raise ValueError('The block spec values {} do not match with '
                       'the schema {}'.format(spec, block_spec_schema))

  decoded_specs = []
  for spec in block_specs:
    spec_dict = dict(zip(block_spec_schema, spec))
    # Scale the filters by the width multiplier and round to a hardware
    # friendly multiple, never dropping below 8 channels.
    spec_dict['filters'] = utils.make_divisible(
        value=spec_dict['filters'] * width_multiplier,
        divisor=divisible_by,
        min_value=8)
    decoded_specs.append(spec_dict)
  return decoded_specs
# pylint: enable=invalid-name
class MobileNet(tf.keras.Model):
  """Creates a MobileNetV3 family model.

  Reference:
    Howard, A., Sandler, M., et al. Searching for mobilenetv3. In ICCV, 2019
  """

  def __init__(
      self,
      model_id: str = 'MobileNetV3Small',
      width_multiplier: float = 1.0,
      output_stride: Optional[int] = None,
      min_width: int = 8,
      divisible_by: int = 8,
      regularize_depthwise: bool = False,
      bn_layer: Callable[..., Any] = tf.keras.layers.BatchNormalization,
      conv_kernel_weight_decay: float = 0.0,
      name: str = 'MobilenNetV3',
      classification_mode: bool = False):
    """Initializes a MobileNet V3 model.

    Args:
      model_id: A `str` of MobileNet version. The supported values are
        `MobileNetV3Large`, `MobileNetV3Small`.
      width_multiplier: A `float` of multiplier for the filters (number of
        channels) for all convolution ops. The value must be greater than
        zero. Typical usage will be to set this value in (0, 1) to reduce the
        number of parameters or computation cost of the model.
      output_stride: An `int` that specifies the requested ratio of input to
        output spatial resolution. If not None, then we invoke atrous
        convolution if necessary to prevent the network from reducing the
        spatial resolution of activation maps. The output_stride should be
        divisible by 4.
      min_width: An `int` of minimum width (number of channels) for all
        convolution ops. Enforced when width_multiplier < 1, and not an
        active constraint when width_multiplier >= 1.
      divisible_by: An `int` that ensures all intermediate feature dimensions
        are divisible by this number.
      regularize_depthwise: If True, apply regularization on depthwise conv.
      bn_layer: An optional tf.keras.layers.Layer that computes the
        normalization (default: tf.keras.layers.BatchNormalization).
      conv_kernel_weight_decay: A float, the weight decay for convolution
        kernels.
      name: Model name. NOTE(review): the default keeps the historical
        'MobilenNetV3' spelling (typo included) so that any checkpoint keyed
        on this name still loads.
      classification_mode: A bool. If True, build every block in the spec,
        including those beyond the 'res5' endpoint (useful for a
        classification head). If False (default), stop building once the
        'res5' endpoint has been produced, which is all that the segmentation
        heads consume.

    Raises:
      ValueError: The MobileNet version is not supported.
      ValueError: width_multiplier is not greater than zero.
      ValueError: Output stride must be None or a multiple of 4.
      ValueError: Unknown block type i for layer j.
    """
    if model_id not in SUPPORTED_SPECS_MAP:
      raise ValueError('The MobileNet version {} '
                       'is not supported'.format(model_id))
    if width_multiplier <= 0:
      raise ValueError('width_multiplier is not greater than zero.')
    if (output_stride is not None and
        (output_stride <= 1 or (output_stride > 1 and output_stride % 4))):
      raise ValueError('Output stride must be None or a multiple of 4.')

    super().__init__(name=name)

    self._model_id = model_id
    self._width_multiplier = width_multiplier
    self._min_width = min_width
    self._output_stride = output_stride
    self._divisible_by = divisible_by
    self._regularize_depthwise = regularize_depthwise
    self._bn_layer = bn_layer
    self._conv_kernel_weight_decay = conv_kernel_weight_decay
    # Bug fix: _mobilenet_base() reads this attribute, but it was never
    # assigned, so every construction raised AttributeError.
    self._classification_mode = classification_mode
    self._blocks = []
    self._endpoint_names = []

    block_specs = SUPPORTED_SPECS_MAP.get(model_id)
    self._decoded_specs = _block_spec_decoder(
        specs=block_specs,
        width_multiplier=self._width_multiplier,
        divisible_by=self._divisible_by)

    self._mobilenet_base()

  def _mobilenet_base(self):
    """Builds the base MobileNet architecture from the decoded specs."""
    # The current_stride variable keeps track of the output stride of the
    # activations, i.e., the running product of convolution strides up to the
    # current network layer. This allows us to invoke atrous convolution
    # whenever applying the next convolution would result in the activations
    # having output stride larger than the target output_stride.
    current_stride = 1

    # The atrous convolution rate parameter.
    rate = 1

    endpoint_level = 1
    in_filters = _INPUT_CHANNELS
    for i, block_def in enumerate(self._decoded_specs):
      # We only need to build up to the 'res5' endpoint for the segmentation
      # task; classification mode builds the full network.
      if endpoint_level > 5 and not self._classification_mode:
        break
      block_name = '{}_{}'.format(block_def['block_fn'], i + 1)
      if (self._output_stride is not None and
          current_stride == self._output_stride):
        # If we have reached the target output_stride, then we need to employ
        # atrous convolution with stride=1 and multiply the atrous rate by the
        # current unit's stride for use in subsequent layers.
        layer_stride = 1
        layer_rate = rate
        rate = (
            rate * block_def['strides']
            if block_def['strides'] is not None else rate)
      else:
        layer_stride = block_def['strides']
        layer_rate = 1
        current_stride = (
            current_stride * block_def['strides']
            if block_def['strides'] is not None else current_stride)

      if block_def['block_fn'] == 'conv_bn':
        self._blocks.append(
            convolutions.Conv2DSame(
                output_channels=block_def['filters'],
                kernel_size=block_def['kernel_size'],
                strides=layer_stride,
                atrous_rate=layer_rate,
                activation=block_def['activation'],
                use_bias=False,
                bn_layer=self._bn_layer,
                use_bn=True,
                conv_kernel_weight_decay=self._conv_kernel_weight_decay,
                name=block_name,
            ))

      elif block_def['block_fn'] == 'inverted_bottleneck':
        atrous_rate = 1
        # There is no need to apply atrous convolution to any 1x1 convolution.
        if layer_rate > 1 and block_def['kernel_size'] != 1:
          atrous_rate = layer_rate
        self._blocks.append(
            blocks.InvertedBottleneckBlock(
                in_filters=in_filters,
                out_filters=block_def['filters'],
                expand_ratio=block_def['expand_ratio'],
                strides=layer_stride,
                kernel_size=block_def['kernel_size'],
                se_ratio=block_def['se_ratio'],
                activation=block_def['activation'],
                expand_se_in_filters=True,
                depthwise_activation=None,
                atrous_rate=atrous_rate,
                divisible_by=self._divisible_by,
                regularize_depthwise=self._regularize_depthwise,
                use_depthwise=True,
                # Note that whether the residual connection would be used is
                # also conditional on the in_filters and out_filters size,
                # even if use_residual=True, e.g. when
                # in_filters != out_filters, no residual connection will be
                # created.
                use_residual=(block_def['strides'] == 1),
                bn_layer=self._bn_layer,
                conv_kernel_weight_decay=self._conv_kernel_weight_decay,
                name=block_name,
            ))

      else:
        raise ValueError('Unknown block type {} for layer {}'.format(
            block_def['block_fn'], i))

      # Register input_filters for the next level.
      in_filters = block_def['filters']

      if block_def['is_endpoint']:
        # Name the endpoint to be 'res{1...5}' to align with ResNet. This
        # simplifies segmentation head implementation.
        self._endpoint_names.append('res' + str(endpoint_level))
        endpoint_level += 1
      else:
        self._endpoint_names.append(None)

  def call(self, input_tensor: tf.Tensor, training: bool = False):
    """Performs a forward pass through MobileNet.

    Args:
      input_tensor: The input image tensor.
      training: A boolean, whether the model is in training mode.

    Returns:
      A dict mapping endpoint names ('res1'...'res5') to the corresponding
      feature tensors.
    """
    net = input_tensor
    endpoints = {}
    for block, endpoint_name in zip(self._blocks, self._endpoint_names):
      net = block(net, training=training)
      if endpoint_name is not None:
        endpoints[endpoint_name] = net
    return endpoints
def MobileNetV3Small(
    width_multiplier: float = 1.0,
    output_stride: int = 32,
    bn_layer: Callable[..., Any] = tf.keras.layers.BatchNormalization,
    conv_kernel_weight_decay: float = 0.0,
    name: str = 'MobileNetV3Small') -> tf.keras.Model:
  """Builds a MobileNetV3Small backbone.

  Args:
    width_multiplier: A float, depth_multiplier for the whole model.
    output_stride: An optional integer specifying the output stride of the
      network.
    bn_layer: An optional tf.keras.layers.Layer that computes the
      normalization (default: tf.keras.layers.BatchNormalization).
    conv_kernel_weight_decay: A float, the weight decay for convolution
      kernels.
    name: Model name.

  Returns:
    The MobileNetV3Small model as an instance of tf.keras.Model.
  """
  return MobileNet(
      model_id='MobileNetV3Small',
      width_multiplier=width_multiplier,
      output_stride=output_stride,
      bn_layer=bn_layer,
      conv_kernel_weight_decay=conv_kernel_weight_decay,
      name=name)
def MobileNetV3Large(
    width_multiplier: float = 1.0,
    output_stride: int = 32,
    bn_layer: Callable[..., Any] = tf.keras.layers.BatchNormalization,
    conv_kernel_weight_decay: float = 0.0,
    name: str = 'MobileNetV3Large') -> tf.keras.Model:
  """Builds a MobileNetV3Large backbone.

  Args:
    width_multiplier: A float, depth_multiplier for the STEM.
    output_stride: An optional integer specifying the output stride of the
      network.
    bn_layer: An optional tf.keras.layers.Layer that computes the
      normalization (default: tf.keras.layers.BatchNormalization).
    conv_kernel_weight_decay: A float, the weight decay for convolution
      kernels.
    name: Model name.

  Returns:
    The MobileNetV3Large model as an instance of tf.keras.Model.
  """
  return MobileNet(
      model_id='MobileNetV3Large',
      width_multiplier=width_multiplier,
      output_stride=output_stride,
      bn_layer=bn_layer,
      conv_kernel_weight_decay=conv_kernel_weight_decay,
      name=name)