Fix formatting of file

PiperOrigin-RevId: 312408716
Change-Id: I63f427c3453745008b159afc7a459df63b0ec8d0
Gaurav Jain 2020-05-19 20:31:31 -07:00 committed by TensorFlower Gardener
parent 361470d24a
commit f8a797e13e
1 changed file with 108 additions and 132 deletions


@@ -12,8 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Normalization layers.
"""
"""Normalization layers."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
@@ -43,7 +42,7 @@ from tensorflow.python.util.tf_export import keras_export
class BatchNormalizationBase(Layer):
r"""Normalize and scale inputs or activations. (Ioffe and Szegedy, 2014).
r"""Normalize and scale inputs or activations.
Normalize the activations of the previous layer at each batch,
i.e. applies a transformation that maintains the mean activation
@@ -65,20 +64,16 @@ class BatchNormalizationBase(Layer):
`training=False` when calling the model, or using `model.predict`.
Arguments:
axis: Integer, the axis that should be normalized
(typically the features axis).
For instance, after a `Conv2D` layer with
`data_format="channels_first"`,
set `axis=1` in `BatchNormalization`.
axis: Integer, the axis that should be normalized (typically the features
axis). For instance, after a `Conv2D` layer with
`data_format="channels_first"`, set `axis=1` in `BatchNormalization`.
momentum: Momentum for the moving average.
epsilon: Small float added to variance to avoid dividing by zero.
center: If True, add offset of `beta` to normalized tensor.
If False, `beta` is ignored.
scale: If True, multiply by `gamma`.
If False, `gamma` is not used.
When the next layer is linear (also e.g. `nn.relu`),
this can be disabled since the scaling
will be done by the next layer.
center: If True, add offset of `beta` to normalized tensor. If False, `beta`
is ignored.
scale: If True, multiply by `gamma`. If False, `gamma` is not used. When the
next layer is linear (also e.g. `nn.relu`), this can be disabled since the
scaling will be done by the next layer.
beta_initializer: Initializer for the beta weight.
gamma_initializer: Initializer for the gamma weight.
moving_mean_initializer: Initializer for the moving mean.
@@ -89,17 +84,17 @@ class BatchNormalizationBase(Layer):
gamma_constraint: Optional constraint for the gamma weight.
renorm: Whether to use [Batch Renormalization](
https://arxiv.org/abs/1702.03275). This adds extra variables during
training. The inference is the same for either value of this parameter.
training. The inference is the same for either value of this parameter.
renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to
scalar `Tensors` used to clip the renorm correction. The correction
`(r, d)` is used as `corrected_value = normalized_value * r + d`, with
`r` clipped to [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax, rmin,
scalar `Tensors` used to clip the renorm correction. The correction `(r,
d)` is used as `corrected_value = normalized_value * r + d`, with `r`
clipped to [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax, rmin,
dmax are set to inf, 0, inf, respectively.
renorm_momentum: Momentum used to update the moving means and standard
deviations with renorm. Unlike `momentum`, this affects training
and should be neither too small (which would add noise) nor too large
(which would give stale estimates). Note that `momentum` is still applied
to get the means and variances for inference.
deviations with renorm. Unlike `momentum`, this affects training and
should be neither too small (which would add noise) nor too large (which
would give stale estimates). Note that `momentum` is still applied to get
the means and variances for inference.
fused: if `True`, use a faster, fused implementation, or raise a ValueError
if the fused implementation cannot be used. If `None`, use the faster
implementation if possible. If False, do not use the fused
@@ -117,54 +112,36 @@ class BatchNormalizationBase(Layer):
example, if axis==-1,
`adjustment = lambda shape: (
tf.random.uniform(shape[-1:], 0.93, 1.07),
tf.random.uniform(shape[-1:], -0.1, 0.1))`
will scale the normalized value by up to 7% up or down, then shift the
result by up to 0.1 (with independent scaling and bias for each feature
but shared across all examples), and finally apply gamma and/or beta. If
`None`, no adjustment is applied. Cannot be specified if
virtual_batch_size is specified.
tf.random.uniform(shape[-1:], -0.1, 0.1))` will scale the normalized
value by up to 7% up or down, then shift the result by up to 0.1
(with independent scaling and bias for each feature but shared
across all examples), and finally apply gamma and/or beta. If
`None`, no adjustment is applied. Cannot be specified if
virtual_batch_size is specified.
Call arguments:
inputs: Input tensor (of any rank).
training: Python boolean indicating whether the layer should behave in
training mode or in inference mode.
- `training=True`: The layer will normalize its inputs using the
mean and variance of the current batch of inputs.
- `training=False`: The layer will normalize its inputs using the
mean and variance of its moving statistics, learned during training.
Input shape:
Arbitrary. Use the keyword argument `input_shape`
(tuple of integers, does not include the samples axis)
when using this layer as the first layer in a model.
Output shape:
Same shape as input.
{{TRAINABLE_ATTRIBUTE_NOTE}}
Normalization equations:
Consider the intermediate activations \\(x\\) of a mini-batch of size
\\(m\\):
We can compute the mean and variance of the batch
\\({\mu_B} = \frac{1}{m} \sum_{i=1}^{m} {x_i}\\)
\\({\sigma_B^2} = \frac{1}{m} \sum_{i=1}^{m} ({x_i} - {\mu_B})^2\\)
and then compute a normalized \\(x\\), including a small factor
\\({\epsilon}\\) for numerical stability.
\\(\hat{x_i} = \frac{x_i - \mu_B}{\sqrt{\sigma_B^2 + \epsilon}}\\)
And finally \\(\hat{x}\\) is linearly transformed by \\({\gamma}\\)
and \\({\beta}\\), which are learned parameters:
\\({y_i} = {\gamma * \hat{x_i} + \beta}\\)
- `training=True`: The layer will normalize its inputs using the mean and
variance of the current batch of inputs.
- `training=False`: The layer will normalize its inputs using the mean and
variance of its moving statistics, learned during training.
Input shape: Arbitrary. Use the keyword argument `input_shape` (tuple of
integers, does not include the samples axis) when using this layer as the
first layer in a model.
Output shape: Same shape as input.
{{TRAINABLE_ATTRIBUTE_NOTE}}
Normalization equations: Consider the intermediate activations \\(x\\) of a
mini-batch of size \\(m\\). We can compute the mean and variance of the
batch,
\\({\mu_B} = \frac{1}{m} \sum_{i=1}^{m} {x_i}\\)
\\({\sigma_B^2} = \frac{1}{m} \sum_{i=1}^{m} ({x_i} - {\mu_B})^2\\),
and then compute a normalized \\(x\\), including a small factor
\\({\epsilon}\\) for numerical stability:
\\(\hat{x_i} = \frac{x_i - \mu_B}{\sqrt{\sigma_B^2 + \epsilon}}\\).
Finally, \\(\hat{x}\\) is linearly transformed by \\({\gamma}\\) and
\\({\beta}\\), which are learned parameters:
\\({y_i} = {\gamma * \hat{x_i} + \beta}\\)
Reference:
- [Ioffe and Szegedy, 2015](https://arxiv.org/abs/1502.03167).
"""
@@ -195,8 +172,7 @@ class BatchNormalizationBase(Layer):
adjustment=None,
name=None,
**kwargs):
super(BatchNormalizationBase, self).__init__(
name=name, **kwargs)
super(BatchNormalizationBase, self).__init__(name=name, **kwargs)
if isinstance(axis, (list, tuple)):
self.axis = axis[:]
elif isinstance(axis, int):
@@ -275,8 +251,8 @@ class BatchNormalizationBase(Layer):
# TODO(reedwm): Support fp64 in FusedBatchNorm then remove this check.
if self._compute_dtype not in ('float16', 'bfloat16', 'float32', None):
raise ValueError('Passing fused=True is only supported when the compute '
'dtype is float16, bfloat16, or float32. Got dtype: %s'
% (self._compute_dtype,))
'dtype is float16, bfloat16, or float32. Got dtype: %s' %
(self._compute_dtype,))
def _fused_can_be_used(self):
try:
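The dtype restriction enforced above is observable from the public API; a small sketch (assuming TensorFlow 2.x; the exact point at which the error surfaces may vary by version):

import tensorflow as tf

try:
  # fused=True with a compute dtype outside {float16, bfloat16, float32}
  # should be rejected with the ValueError constructed above.
  bn = tf.keras.layers.BatchNormalization(fused=True, dtype='float64')
  bn.build((None, 28, 28, 3))  # 4-D shape, so only the dtype is at fault
except ValueError as e:
  print(e)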
@@ -380,13 +356,14 @@ class BatchNormalizationBase(Layer):
param_shape = (list(axis_to_dim.values())[0],)
else:
# Parameter shape is the original shape but with 1 in all non-axis dims
param_shape = [axis_to_dim[i] if i in axis_to_dim
else 1 for i in range(ndims)]
param_shape = [
axis_to_dim[i] if i in axis_to_dim else 1 for i in range(ndims)
]
if self.virtual_batch_size is not None:
# When using virtual batches, add an extra dim at index 1
param_shape.insert(1, 1)
for idx, x in enumerate(self.axis):
self.axis[idx] = x + 1 # Account for added dimension
self.axis[idx] = x + 1 # Account for added dimension
if self.scale:
self.gamma = self.add_weight(
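A standalone re-derivation of the `param_shape` logic above, using a hypothetical NHWC input of shape (batch, 28, 28, 16); nothing here calls into the layer itself:

input_shape = (None, 28, 28, 16)  # hypothetical NHWC shape
axis = [3]                        # normalize over the channel axis
ndims = len(input_shape)
axis_to_dim = {a: input_shape[a] for a in axis}

if len(axis) == 1:
  # Single-axis case: gamma/beta/moving stats are vectors over that dim.
  param_shape = (list(axis_to_dim.values())[0],)
else:
  # Multi-axis case: original shape with 1 in every non-axis dimension.
  param_shape = [axis_to_dim[i] if i in axis_to_dim else 1
                 for i in range(ndims)]

print(param_shape)  # (16,); for axis=[1, 3] it would be [1, 28, 1, 16]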
@@ -507,8 +484,7 @@ class BatchNormalizationBase(Layer):
decay = ops.convert_to_tensor_v2(1.0 - momentum, name='decay')
if decay.dtype != variable.dtype.base_dtype:
decay = math_ops.cast(decay, variable.dtype.base_dtype)
update_delta = (
variable - math_ops.cast(value, variable.dtype)) * decay
update_delta = (variable - math_ops.cast(value, variable.dtype)) * decay
if inputs_size is not None:
update_delta = array_ops.where(inputs_size > 0, update_delta,
K.zeros_like(update_delta))
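For reference, the `update_delta` above is the usual exponential moving average written as a subtraction (the delta is subsequently subtracted from the moving variable); a tiny numeric check of the equivalence:

momentum = 0.99
moving_mean, batch_mean = 0.0, 5.0

decay = 1.0 - momentum
update_delta = (moving_mean - batch_mean) * decay
via_delta = moving_mean - update_delta                               # 0.05
via_blend = momentum * moving_mean + (1.0 - momentum) * batch_mean   # 0.05
assert abs(via_delta - via_blend) < 1e-12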
@@ -650,8 +626,9 @@ class BatchNormalizationBase(Layer):
with ops.control_dependencies([r, d]):
mean = array_ops.identity(mean)
stddev = array_ops.identity(stddev)
rmin, rmax, dmax = [self.renorm_clipping.get(key)
for key in ['rmin', 'rmax', 'dmax']]
rmin, rmax, dmax = [
self.renorm_clipping.get(key) for key in ['rmin', 'rmax', 'dmax']
]
if rmin is not None:
r = math_ops.maximum(r, rmin)
if rmax is not None:
@@ -661,13 +638,13 @@ class BatchNormalizationBase(Layer):
d = math_ops.minimum(d, dmax)
# When not training, use r=1, d=0.
r = tf_utils.smart_cond(training, lambda: r, lambda: array_ops.ones_like(r))
d = tf_utils.smart_cond(training,
lambda: d,
d = tf_utils.smart_cond(training, lambda: d,
lambda: array_ops.zeros_like(d))
def _update_renorm_variable(var, value, inputs_size):
"""Updates a moving average and weight, returns the unbiased value."""
value = array_ops.identity(value)
def _do_update():
"""Updates the var, returns the updated value."""
new_var = self._assign_moving_average(var, value, self.renorm_momentum,
@@ -676,6 +653,7 @@ class BatchNormalizationBase(Layer):
def _fake_update():
return array_ops.identity(var)
return tf_utils.smart_cond(training, _do_update, _fake_update)
# TODO(yuefengz): colocate the operations
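For context, a small numeric sketch of the renorm correction that these clipped `r` and `d` implement, following the Batch Renormalization paper linked in the docstring (the statistics below are made up):

import numpy as np

rmin, rmax, dmax = 1.0 / 3.0, 3.0, 5.0      # example renorm_clipping values

batch_mean, batch_std = 4.0, 2.0            # statistics of the current batch
moving_mean, moving_std = 3.0, 1.0          # moving (inference) statistics

r = np.clip(batch_std / moving_std, rmin, rmax)                    # 2.0
d = np.clip((batch_mean - moving_mean) / moving_std, -dmax, dmax)  # 1.0

x = 5.0
normalized = (x - batch_mean) / batch_std   # normalized with batch stats: 0.5
corrected = normalized * r + d              # corrected_value per the docstring: 2.0

# Without clipping, the correction reproduces normalization by moving stats.
assert abs(corrected - (x - moving_mean) / moving_std) < 1e-12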
@@ -753,12 +731,13 @@ class BatchNormalizationBase(Layer):
ndims = len(input_shape)
reduction_axes = [i for i in range(ndims) if i not in self.axis]
if self.virtual_batch_size is not None:
del reduction_axes[1] # Do not reduce along virtual batch dim
del reduction_axes[1] # Do not reduce along virtual batch dim
# Broadcasting only necessary for single-axis batch norm where the axis is
# not the last dimension
broadcast_shape = [1] * ndims
broadcast_shape[self.axis[0]] = input_shape.dims[self.axis[0]].value
def _broadcast(v):
if (v is not None and len(v.shape) != ndims and
reduction_axes != list(range(ndims - 1))):
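A standalone sketch of the broadcast shape computed above, for a hypothetical single-axis batch norm over axis=1 of an NCHW input (the case where broadcasting is actually needed):

input_shape = (None, 16, 32, 32)  # hypothetical NCHW shape
axis = [1]
ndims = len(input_shape)

broadcast_shape = [1] * ndims
broadcast_shape[axis[0]] = input_shape[axis[0]]
print(broadcast_shape)  # [1, 16, 1, 1]; gamma/beta of shape (16,) reshape to
                        # this so they broadcast against the 4-D input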
@@ -783,11 +762,9 @@ class BatchNormalizationBase(Layer):
if self.adjustment:
adj_scale, adj_bias = self.adjustment(array_ops.shape(inputs))
# Adjust only during training.
adj_scale = tf_utils.smart_cond(training,
lambda: adj_scale,
adj_scale = tf_utils.smart_cond(training, lambda: adj_scale,
lambda: array_ops.ones_like(adj_scale))
adj_bias = tf_utils.smart_cond(training,
lambda: adj_bias,
adj_bias = tf_utils.smart_cond(training, lambda: adj_bias,
lambda: array_ops.zeros_like(adj_bias))
scale, offset = _compose_transforms(adj_scale, adj_bias, scale, offset)
@@ -879,11 +856,8 @@ class BatchNormalizationBase(Layer):
scale = math_ops.cast(scale, inputs.dtype)
# TODO(reedwm): Maybe do math in float32 if given float16 inputs, if doing
# math in float16 hurts validation accuracy of popular models like resnet.
outputs = nn.batch_normalization(inputs,
_broadcast(mean),
_broadcast(variance),
offset,
scale,
outputs = nn.batch_normalization(inputs, _broadcast(mean),
_broadcast(variance), offset, scale,
self.epsilon)
# If some components of the shape got lost due to adjustments, fix that.
outputs.set_shape(input_shape)
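As a sanity check on the call above, `tf.nn.batch_normalization` applies the same formula as the docstring equations; a short sketch assuming TensorFlow 2.x:

import tensorflow as tf

x = tf.constant([[1.0, 2.0], [3.0, 6.0]])
mean, variance = tf.nn.moments(x, axes=[0])
offset = tf.constant([0.1, -0.1])  # beta
scale = tf.constant([2.0, 0.5])    # gamma
epsilon = 1e-3

out = tf.nn.batch_normalization(x, mean, variance, offset, scale, epsilon)
manual = (x - mean) / tf.sqrt(variance + epsilon) * scale + offset
tf.debugging.assert_near(out, manual)  # identical up to float rounding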
@@ -897,21 +871,32 @@ class BatchNormalizationBase(Layer):
def get_config(self):
config = {
'axis': self.axis,
'momentum': self.momentum,
'epsilon': self.epsilon,
'center': self.center,
'scale': self.scale,
'beta_initializer': initializers.serialize(self.beta_initializer),
'gamma_initializer': initializers.serialize(self.gamma_initializer),
'axis':
self.axis,
'momentum':
self.momentum,
'epsilon':
self.epsilon,
'center':
self.center,
'scale':
self.scale,
'beta_initializer':
initializers.serialize(self.beta_initializer),
'gamma_initializer':
initializers.serialize(self.gamma_initializer),
'moving_mean_initializer':
initializers.serialize(self.moving_mean_initializer),
'moving_variance_initializer':
initializers.serialize(self.moving_variance_initializer),
'beta_regularizer': regularizers.serialize(self.beta_regularizer),
'gamma_regularizer': regularizers.serialize(self.gamma_regularizer),
'beta_constraint': constraints.serialize(self.beta_constraint),
'gamma_constraint': constraints.serialize(self.gamma_constraint)
'beta_regularizer':
regularizers.serialize(self.beta_regularizer),
'gamma_regularizer':
regularizers.serialize(self.gamma_regularizer),
'beta_constraint':
constraints.serialize(self.beta_constraint),
'gamma_constraint':
constraints.serialize(self.gamma_constraint)
}
# Only add TensorFlow-specific parameters if they are set, so as to preserve
# model compatibility with external Keras.
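The config assembled above round-trips through `from_config`; a brief sketch (assuming TensorFlow 2.x and the public Keras layer):

import tensorflow as tf

layer = tf.keras.layers.BatchNormalization(axis=1, momentum=0.9, epsilon=1e-5)
config = layer.get_config()
print(config['momentum'], config['epsilon'])  # 0.9 1e-05

clone = tf.keras.layers.BatchNormalization.from_config(config)
assert clone.momentum == layer.momentum and clone.epsilon == layer.epsilon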
@@ -942,16 +927,14 @@ def replace_in_base_docstring(replacements):
@keras_export(v1=['keras.layers.BatchNormalization']) # pylint: disable=missing-docstring
class BatchNormalization(BatchNormalizationBase):
__doc__ = replace_in_base_docstring(
[('''
__doc__ = replace_in_base_docstring([("""
fused: if `True`, use a faster, fused implementation, or raise a ValueError
if the fused implementation cannot be used. If `None`, use the faster
implementation if possible. If False, do not use the fused
implementation.''',
'''
implementation.""", """
fused: if `None` or `True`, use a faster, fused implementation if possible.
If `False`, use the system recommended implementation.'''),
('{{TRAINABLE_ATTRIBUTE_NOTE}}', '')])
If `False`, use the system recommended implementation."""),
('{{TRAINABLE_ATTRIBUTE_NOTE}}', '')])
_USE_V2_BEHAVIOR = False
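For readers unfamiliar with the helper, `replace_in_base_docstring` as used here effectively performs sequential substring substitution on the base class docstring; a minimal illustrative sketch of that idea (not the actual helper):

def _replace_in_docstring(doc, replacements):
  # Apply (old, new) pairs in order; each old snippet must appear in doc.
  for old, new in replacements:
    assert old in doc, 'pattern not found: %r' % old[:40]
    doc = doc.replace(old, new)
  return doc

base = 'fused: v2 default behavior.\n{{TRAINABLE_ATTRIBUTE_NOTE}}'
print(_replace_in_docstring(base,
                            [('v2 default behavior.', 'v1 behavior.'),
                             ('{{TRAINABLE_ATTRIBUTE_NOTE}}', '')]))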
@@ -1048,37 +1031,30 @@ class LayerNormalization(Layer):
Arguments:
axis: Integer or List/Tuple. The axis or axes
to normalize across. Typically this is the features axis/axes. The
left-out axes are typically the batch axis/axes.
This argument defaults to `-1`, the last dimension in the input.
epsilon: Small float added to variance to avoid dividing by zero.
Defaults to 1e-3.
center: If True, add offset of `beta` to normalized tensor.
If False, `beta` is ignored. Defaults to True.
scale: If True, multiply by `gamma`.
If False, `gamma` is not used. Defaults to True.
When the next layer is linear (also e.g. `nn.relu`),
this can be disabled since the scaling
will be done by the next layer.
axis: Integer or List/Tuple. The axis or axes to normalize across. Typically
this is the features axis/axes. The left-out axes are typically the batch
axis/axes. This argument defaults to `-1`, the last dimension in the
input.
epsilon: Small float added to variance to avoid dividing by zero. Defaults
to 1e-3.
center: If True, add offset of `beta` to normalized tensor. If False, `beta`
is ignored. Defaults to True.
scale: If True, multiply by `gamma`. If False, `gamma` is not used. Defaults
to True. When the next layer is linear (also e.g. `nn.relu`), this can be
disabled since the scaling will be done by the next layer.
beta_initializer: Initializer for the beta weight. Defaults to zeros.
gamma_initializer: Initializer for the gamma weight. Defaults to ones.
beta_regularizer: Optional regularizer for the beta weight. None by default.
gamma_regularizer: Optional regularizer for the gamma weight.
None by default.
gamma_regularizer: Optional regularizer for the gamma weight. None by
default.
beta_constraint: Optional constraint for the beta weight. None by default.
gamma_constraint: Optional constraint for the gamma weight. None by default.
trainable: Boolean, if `True` the variables will be marked as trainable.
Defaults to True.
Input shape:
Arbitrary. Use the keyword argument `input_shape`
(tuple of integers, does not include the samples axis)
when using this layer as the first layer in a model.
Output shape:
Same shape as input.
Input shape: Arbitrary. Use the keyword argument `input_shape` (tuple of
integers, does not include the samples axis) when using this layer as the
first layer in a model.
Output shape: Same shape as input.
Reference:
- [Lei Ba et al., 2016](https://arxiv.org/abs/1607.06450).
"""
@@ -1204,9 +1180,9 @@ class LayerNormalization(Layer):
broadcast_shape = [1] * ndims
for dim in self.axis:
broadcast_shape[dim] = input_shape.dims[dim].value
def _broadcast(v):
if (v is not None and len(v.shape) != ndims and
self.axis != [ndims - 1]):
if (v is not None and len(v.shape) != ndims and self.axis != [ndims - 1]):
return array_ops.reshape(v, broadcast_shape)
return v