512 lines
19 KiB
Python
512 lines
19 KiB
Python
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
# ==============================================================================
|
|
# pylint: disable=invalid-name
|
|
"""MobileNet v2 models for Keras.
|
|
|
|
MobileNetV2 is a general architecture and can be used for multiple use cases.
|
|
Depending on the use case, it can use different input layer size and
|
|
different width factors. This allows different width models to reduce
|
|
the number of multiply-adds and thereby
|
|
reduce inference cost on mobile devices.
|
|
|
|
MobileNetV2 is very similar to the original MobileNet,
|
|
except that it uses inverted residual blocks with
|
|
bottlenecking features. It has a drastically lower
|
|
parameter count than the original MobileNet.
|
|
MobileNets support any input size greater
|
|
than 32 x 32, with larger image sizes
|
|
offering better performance.
|
|
|
|
The number of parameters and number of multiply-adds
|
|
can be modified by using the `alpha` parameter,
|
|
which increases/decreases the number of filters in each layer.
|
|
By altering the image size and `alpha` parameter,
|
|
all 22 models from the paper can be built, with ImageNet weights provided.
|
|
|
|
The paper demonstrates the performance of MobileNets using `alpha` values of
|
|
0.35, 0.5, 0.75, 1.0 (also called 100% MobileNet), 1.3, and 1.4.
|
|
For each of these `alpha` values, weights for 5 different input image sizes
|
|
are provided (224, 192, 160, 128, and 96).
|
|
|
|
The following table describes the performance of
|
|
MobileNet on various input sizes:
|
|
------------------------------------------------------------------------
|
|
MACs stands for Multiply Adds
|
|
Classification Checkpoint|MACs (M)|Parameters (M)|Top 1 Accuracy|Top 5 Accuracy
|
|
--------------------------|--------|--------------|--------------|--------------
|
|
| [mobilenet_v2_1.4_224] | 582 | 6.06 | 75.0 | 92.5 |
|
|
| [mobilenet_v2_1.3_224] | 509 | 5.34 | 74.4 | 92.1 |
|
|
| [mobilenet_v2_1.0_224] | 300 | 3.47 | 71.8 | 91.0 |
|
|
| [mobilenet_v2_1.0_192] | 221 | 3.47 | 70.7 | 90.1 |
|
|
| [mobilenet_v2_1.0_160] | 154 | 3.47 | 68.8 | 89.0 |
|
|
| [mobilenet_v2_1.0_128] | 99 | 3.47 | 65.3 | 86.9 |
|
|
| [mobilenet_v2_1.0_96] | 56 | 3.47 | 60.3 | 83.2 |
|
|
| [mobilenet_v2_0.75_224] | 209 | 2.61 | 69.8 | 89.6 |
|
|
| [mobilenet_v2_0.75_192] | 153 | 2.61 | 68.7 | 88.9 |
|
|
| [mobilenet_v2_0.75_160] | 107 | 2.61 | 66.4 | 87.3 |
|
|
| [mobilenet_v2_0.75_128] | 69 | 2.61 | 63.2 | 85.3 |
|
|
| [mobilenet_v2_0.75_96] | 39 | 2.61 | 58.8 | 81.6 |
|
|
| [mobilenet_v2_0.5_224] | 97 | 1.95 | 65.4 | 86.4 |
|
|
| [mobilenet_v2_0.5_192] | 71 | 1.95 | 63.9 | 85.4 |
|
|
| [mobilenet_v2_0.5_160] | 50 | 1.95 | 61.0 | 83.2 |
|
|
| [mobilenet_v2_0.5_128] | 32 | 1.95 | 57.7 | 80.8 |
|
|
| [mobilenet_v2_0.5_96] | 18 | 1.95 | 51.2 | 75.8 |
|
|
| [mobilenet_v2_0.35_224] | 59 | 1.66 | 60.3 | 82.9 |
|
|
| [mobilenet_v2_0.35_192] | 43 | 1.66 | 58.2 | 81.2 |
|
|
| [mobilenet_v2_0.35_160] | 30 | 1.66 | 55.7 | 79.1 |
|
|
| [mobilenet_v2_0.35_128] | 20 | 1.66 | 50.8 | 75.0 |
|
|
| [mobilenet_v2_0.35_96] | 11 | 1.66 | 45.5 | 70.4 |
|
|
|
|
Reference:
|
|
- [MobileNetV2: Inverted Residuals and Linear Bottlenecks](
|
|
https://arxiv.org/abs/1801.04381) (CVPR 2018)
|
|
"""
|
|
from __future__ import absolute_import
|
|
from __future__ import division
|
|
from __future__ import print_function
|
|
|
|
from tensorflow.python.keras import backend
|
|
from tensorflow.python.keras.applications import imagenet_utils
|
|
from tensorflow.python.keras.engine import training
|
|
from tensorflow.python.keras.layers import VersionAwareLayers
|
|
from tensorflow.python.keras.utils import data_utils
|
|
from tensorflow.python.keras.utils import layer_utils
|
|
from tensorflow.python.lib.io import file_io
|
|
from tensorflow.python.platform import tf_logging as logging
|
|
from tensorflow.python.util.tf_export import keras_export
|
|
|
|
# Root URL where the pretrained MobileNetV2 checkpoint files are hosted;
# `MobileNetV2` appends a per-alpha/per-resolution file name to this path.
BASE_WEIGHT_PATH = ('https://storage.googleapis.com/tensorflow/'
                    'keras-applications/mobilenet_v2/')
# Module-level layer-factory handle. It is populated by `MobileNetV2` at call
# time: either a `VersionAwareLayers` instance, or whatever the caller injects
# through the legacy `layers=` keyword argument.
layers = None
|
|
|
|
|
|
@keras_export('keras.applications.mobilenet_v2.MobileNetV2',
              'keras.applications.MobileNetV2')
def MobileNetV2(input_shape=None,
                alpha=1.0,
                include_top=True,
                weights='imagenet',
                input_tensor=None,
                pooling=None,
                classes=1000,
                classifier_activation='softmax',
                **kwargs):
  """Instantiates the MobileNetV2 architecture.

  Reference:
  - [MobileNetV2: Inverted Residuals and Linear Bottlenecks](
      https://arxiv.org/abs/1801.04381) (CVPR 2018)

  Optionally loads weights pre-trained on ImageNet.

  Note: each Keras Application expects a specific kind of input preprocessing.
  For MobileNetV2, call `tf.keras.applications.mobilenet_v2.preprocess_input`
  on your inputs before passing them to the model.

  Arguments:
    input_shape: Optional shape tuple, to be specified if you would
      like to use a model with an input image resolution that is not
      (224, 224, 3).
      It should have exactly 3 inputs channels (224, 224, 3).
      You can also omit this option if you would like
      to infer input_shape from an input_tensor.
      If you choose to include both input_tensor and input_shape then
      input_shape will be used if they match, if the shapes
      do not match then we will throw an error.
      E.g. `(160, 160, 3)` would be one valid value.
    alpha: Float between 0 and 1. controls the width of the network.
      This is known as the width multiplier in the MobileNetV2 paper,
      but the name is kept for consistency with `applications.MobileNetV1`
      model in Keras.
      - If `alpha` < 1.0, proportionally decreases the number
          of filters in each layer.
      - If `alpha` > 1.0, proportionally increases the number
          of filters in each layer.
      - If `alpha` = 1, default number of filters from the paper
          are used at each layer.
    include_top: Boolean, whether to include the fully-connected
      layer at the top of the network. Defaults to `True`.
    weights: String, one of `None` (random initialization),
      'imagenet' (pre-training on ImageNet),
      or the path to the weights file to be loaded.
    input_tensor: Optional Keras tensor (i.e. output of
      `layers.Input()`)
      to use as image input for the model.
    pooling: String, optional pooling mode for feature extraction
      when `include_top` is `False`.
      - `None` means that the output of the model
          will be the 4D tensor output of the
          last convolutional block.
      - `avg` means that global average pooling
          will be applied to the output of the
          last convolutional block, and thus
          the output of the model will be a
          2D tensor.
      - `max` means that global max pooling will
          be applied.
    classes: Integer, optional number of classes to classify images
      into, only to be specified if `include_top` is True, and
      if no `weights` argument is specified.
    classifier_activation: A `str` or callable. The activation function to use
      on the "top" layer. Ignored unless `include_top=True`. Set
      `classifier_activation=None` to return the logits of the "top" layer.
    **kwargs: For backwards compatibility only. The only recognized key is
      `layers`, a layer-module override.

  Returns:
    A `keras.Model` instance.

  Raises:
    ValueError: in case of invalid argument for `weights`,
      or invalid input shape or invalid alpha, rows when
      weights='imagenet'
    ValueError: if `classifier_activation` is not `softmax` or `None` when
      using a pretrained top layer.
  """
  global layers
  if 'layers' in kwargs:
    layers = kwargs.pop('layers')
  else:
    layers = VersionAwareLayers()
  if kwargs:
    raise ValueError('Unknown argument(s): %s' % (kwargs,))
  if not (weights in {'imagenet', None} or file_io.file_exists_v2(weights)):
    raise ValueError('The `weights` argument should be either '
                     '`None` (random initialization), `imagenet` '
                     '(pre-training on ImageNet), '
                     'or the path to the weights file to be loaded.')

  if weights == 'imagenet' and include_top and classes != 1000:
    raise ValueError('If using `weights` as `"imagenet"` with `include_top` '
                     'as true, `classes` should be 1000')

  # Determine proper input shape and default size.
  # If both input_shape and input_tensor are used, they should match.
  if input_shape is not None and input_tensor is not None:
    try:
      is_input_t_tensor = backend.is_keras_tensor(input_tensor)
    except ValueError:
      try:
        is_input_t_tensor = backend.is_keras_tensor(
            layer_utils.get_source_inputs(input_tensor))
      except ValueError:
        raise ValueError('input_tensor: ', input_tensor,
                         'is not type input_tensor')
    if is_input_t_tensor:
      if backend.image_data_format() == 'channels_first':
        if backend.int_shape(input_tensor)[1] != input_shape[1]:
          raise ValueError('input_shape: ', input_shape, 'and input_tensor: ',
                           input_tensor,
                           'do not meet the same shape requirements')
      else:
        if backend.int_shape(input_tensor)[2] != input_shape[1]:
          raise ValueError('input_shape: ', input_shape, 'and input_tensor: ',
                           input_tensor,
                           'do not meet the same shape requirements')
    else:
      raise ValueError('input_tensor specified: ', input_tensor,
                       'is not a keras tensor')

  # If input_shape is None, infer shape from input_tensor.
  if input_shape is None and input_tensor is not None:

    try:
      backend.is_keras_tensor(input_tensor)
    except ValueError:
      raise ValueError('input_tensor: ', input_tensor, 'is type: ',
                       type(input_tensor), 'which is not a valid type')

    if input_shape is None and not backend.is_keras_tensor(input_tensor):
      default_size = 224
    elif input_shape is None and backend.is_keras_tensor(input_tensor):
      if backend.image_data_format() == 'channels_first':
        rows = backend.int_shape(input_tensor)[2]
        cols = backend.int_shape(input_tensor)[3]
      else:
        rows = backend.int_shape(input_tensor)[1]
        cols = backend.int_shape(input_tensor)[2]

      # Square inputs at one of the released resolutions keep their size;
      # everything else falls back to the 224 default.
      if rows == cols and rows in [96, 128, 160, 192, 224]:
        default_size = rows
      else:
        default_size = 224

  # If input_shape is None and no input_tensor.
  elif input_shape is None:
    default_size = 224

  # If input_shape is not None, assume default size.
  else:
    if backend.image_data_format() == 'channels_first':
      rows = input_shape[1]
      cols = input_shape[2]
    else:
      rows = input_shape[0]
      cols = input_shape[1]

    if rows == cols and rows in [96, 128, 160, 192, 224]:
      default_size = rows
    else:
      default_size = 224

  input_shape = imagenet_utils.obtain_input_shape(
      input_shape,
      default_size=default_size,
      min_size=32,
      data_format=backend.image_data_format(),
      require_flatten=include_top,
      weights=weights)

  if backend.image_data_format() == 'channels_last':
    row_axis, col_axis = (0, 1)
  else:
    row_axis, col_axis = (1, 2)
  rows = input_shape[row_axis]
  cols = input_shape[col_axis]

  if weights == 'imagenet':
    if alpha not in [0.35, 0.50, 0.75, 1.0, 1.3, 1.4]:
      raise ValueError('If imagenet weights are being loaded, '
                       'alpha can be one of `0.35`, `0.50`, `0.75`, '
                       '`1.0`, `1.3` or `1.4` only.')

    if rows != cols or rows not in [96, 128, 160, 192, 224]:
      rows = 224
      logging.warning('`input_shape` is undefined or non-square, '
                      'or `rows` is not in [96, 128, 160, 192, 224].'
                      ' Weights for input shape (224, 224) will be'
                      ' loaded as the default.')

  if input_tensor is None:
    img_input = layers.Input(shape=input_shape)
  else:
    if not backend.is_keras_tensor(input_tensor):
      img_input = layers.Input(tensor=input_tensor, shape=input_shape)
    else:
      img_input = input_tensor

  channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1

  # Stem: strided 3x3 convolution followed by BN and ReLU6.
  first_block_filters = _make_divisible(32 * alpha, 8)
  x = layers.Conv2D(
      first_block_filters,
      kernel_size=3,
      strides=(2, 2),
      padding='same',
      use_bias=False,
      name='Conv1')(img_input)
  x = layers.BatchNormalization(
      axis=channel_axis, epsilon=1e-3, momentum=0.999, name='bn_Conv1')(x)
  x = layers.ReLU(6., name='Conv1_relu')(x)

  # (filters, stride, expansion) for each inverted residual block, in order;
  # the list index is the `block_id` used for layer naming. Expressing the
  # stack as data removes 17 near-identical call sites.
  block_configs = [
      (16, 1, 1),
      (24, 2, 6), (24, 1, 6),
      (32, 2, 6), (32, 1, 6), (32, 1, 6),
      (64, 2, 6), (64, 1, 6), (64, 1, 6), (64, 1, 6),
      (96, 1, 6), (96, 1, 6), (96, 1, 6),
      (160, 2, 6), (160, 1, 6), (160, 1, 6),
      (320, 1, 6),
  ]
  for block_id, (filters, stride, expansion) in enumerate(block_configs):
    x = _inverted_res_block(
        x, filters=filters, alpha=alpha, stride=stride,
        expansion=expansion, block_id=block_id)

  # no alpha applied to last conv as stated in the paper:
  # if the width multiplier is greater than 1 we
  # increase the number of output channels
  if alpha > 1.0:
    last_block_filters = _make_divisible(1280 * alpha, 8)
  else:
    last_block_filters = 1280

  x = layers.Conv2D(
      last_block_filters, kernel_size=1, use_bias=False, name='Conv_1')(x)
  x = layers.BatchNormalization(
      axis=channel_axis, epsilon=1e-3, momentum=0.999, name='Conv_1_bn')(x)
  x = layers.ReLU(6., name='out_relu')(x)

  if include_top:
    x = layers.GlobalAveragePooling2D()(x)
    imagenet_utils.validate_activation(classifier_activation, weights)
    x = layers.Dense(classes, activation=classifier_activation,
                     name='predictions')(x)

  else:
    if pooling == 'avg':
      x = layers.GlobalAveragePooling2D()(x)
    elif pooling == 'max':
      x = layers.GlobalMaxPooling2D()(x)

  # Ensure that the model takes into account
  # any potential predecessors of `input_tensor`.
  if input_tensor is not None:
    inputs = layer_utils.get_source_inputs(input_tensor)
  else:
    inputs = img_input

  # Create model.
  model = training.Model(inputs, x, name='mobilenetv2_%0.2f_%s' % (alpha, rows))

  # Load weights. The top/no-top checkpoints differ only in a file-name
  # suffix, so build the name once instead of duplicating the download logic.
  if weights == 'imagenet':
    model_name = ('mobilenet_v2_weights_tf_dim_ordering_tf_kernels_' +
                  str(alpha) + '_' + str(rows) +
                  ('' if include_top else '_no_top') + '.h5')
    weight_path = BASE_WEIGHT_PATH + model_name
    weights_path = data_utils.get_file(
        model_name, weight_path, cache_subdir='models')
    model.load_weights(weights_path)
  elif weights is not None:
    model.load_weights(weights)

  return model
|
|
|
|
|
|
def _inverted_res_block(inputs, expansion, stride, alpha, filters, block_id):
  """Builds one inverted residual block (expand -> depthwise -> project).

  Arguments:
    inputs: Input tensor.
    expansion: Integer factor widening the input channels in the expand phase.
    stride: Stride of the depthwise convolution (1 or 2).
    alpha: Width multiplier applied to the projected (output) filters.
    filters: Base number of output filters before `alpha` scaling.
    block_id: Integer used in layer names; block 0 skips the expand phase.

  Returns:
    Output tensor, with a residual add when `stride == 1` and the input and
    projected channel counts match.
  """
  channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1
  in_channels = backend.int_shape(inputs)[channel_axis]
  # Output width is scaled by alpha, then snapped to a multiple of 8.
  pointwise_filters = _make_divisible(int(filters * alpha), 8)
  prefix = 'block_{}_'.format(block_id)
  x = inputs

  if block_id:
    # Expand: linear 1x1 convolution widening the channel count, then BN/ReLU6.
    x = layers.Conv2D(
        expansion * in_channels,
        kernel_size=1,
        padding='same',
        use_bias=False,
        activation=None,
        name=prefix + 'expand')(x)
    x = layers.BatchNormalization(
        axis=channel_axis,
        epsilon=1e-3,
        momentum=0.999,
        name=prefix + 'expand_BN')(x)
    x = layers.ReLU(6., name=prefix + 'expand_relu')(x)
  else:
    # The first block has no expand phase and uses a fixed name prefix.
    prefix = 'expanded_conv_'

  # Depthwise 3x3 convolution; pad explicitly when striding so the
  # convolution can use 'valid' padding.
  if stride == 2:
    x = layers.ZeroPadding2D(
        padding=imagenet_utils.correct_pad(x, 3),
        name=prefix + 'pad')(x)
  x = layers.DepthwiseConv2D(
      kernel_size=3,
      strides=stride,
      activation=None,
      use_bias=False,
      padding='same' if stride == 1 else 'valid',
      name=prefix + 'depthwise')(x)
  x = layers.BatchNormalization(
      axis=channel_axis,
      epsilon=1e-3,
      momentum=0.999,
      name=prefix + 'depthwise_BN')(x)
  x = layers.ReLU(6., name=prefix + 'depthwise_relu')(x)

  # Project: linear 1x1 convolution back down to the bottleneck width
  # (no activation after the projection).
  x = layers.Conv2D(
      pointwise_filters,
      kernel_size=1,
      padding='same',
      use_bias=False,
      activation=None,
      name=prefix + 'project')(x)
  x = layers.BatchNormalization(
      axis=channel_axis,
      epsilon=1e-3,
      momentum=0.999,
      name=prefix + 'project_BN')(x)

  # Residual connection only when shapes line up.
  if in_channels == pointwise_filters and stride == 1:
    return layers.Add(name=prefix + 'add')([inputs, x])
  return x
|
|
|
|
|
|
def _make_divisible(v, divisor, min_value=None):
|
|
if min_value is None:
|
|
min_value = divisor
|
|
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
|
|
# Make sure that round down does not go down by more than 10%.
|
|
if new_v < 0.9 * v:
|
|
new_v += divisor
|
|
return new_v
|
|
|
|
|
|
@keras_export('keras.applications.mobilenet_v2.preprocess_input')
def preprocess_input(x, data_format=None):
  # Thin wrapper over the shared application preprocessing with mode='tf'
  # (see `imagenet_utils.preprocess_input` for the mode semantics). The
  # public docstring is attached at the bottom of this module from the
  # `imagenet_utils.PREPROCESS_INPUT_DOC` template.
  return imagenet_utils.preprocess_input(x, data_format=data_format, mode='tf')
|
|
|
|
|
|
@keras_export('keras.applications.mobilenet_v2.decode_predictions')
def decode_predictions(preds, top=5):
  # Thin wrapper over the shared ImageNet prediction decoder; the public
  # docstring is copied from `imagenet_utils.decode_predictions` at the
  # bottom of this module.
  return imagenet_utils.decode_predictions(preds, top=top)
|
|
|
|
|
|
# Attach the shared docstring templates so `preprocess_input` and
# `decode_predictions` are documented consistently across all Keras
# application modules.
preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format(
    mode='',
    ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TF,
    error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC)
decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__
|