Merge pull request #27655 from MattConley:loss_scaling_optimizer

PiperOrigin-RevId: 243664104
2019-04-15 12:37:36 -07:00 · 2019-04-15 12:37:36 -07:00 · 447e512d33
commit 447e512d33
parent 0069a9b092 0f3d3df0a7
10 changed files with 1229 additions and 1 deletions
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@ -2858,6 +2858,53 @@ py_library(
    ],
 )

+py_library(
+    name = "loss_scale",
+    srcs = ["training/experimental/loss_scale.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:framework",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_library(
+    name = "loss_scale_optimizer",
+    srcs = ["training/experimental/loss_scale_optimizer.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":loss_scale",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "loss_scale_optimizer_test",
+    size = "small",
+    srcs = ["training/experimental/loss_scale_optimizer_test.py"],
+    deps = [
+        ":loss_scale_optimizer",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/distribute:mirrored_strategy",
+        "//tensorflow/python/distribute:one_device_strategy",
+        "//tensorflow/python/keras/mixed_precision/experimental:test_util",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "loss_scale_test",
+    size = "small",
+    srcs = ["training/experimental/loss_scale_test.py"],
+    deps = [
+        ":loss_scale",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/distribute:mirrored_strategy",
+        "//tensorflow/python/distribute:one_device_strategy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 py_library(
    name = "math_grad",
    srcs = ["ops/math_grad.py"],
@ -3962,6 +4009,8 @@ py_library(
        ":io_ops",
        ":layers_util",
        ":lookup_ops",
+        ":loss_scale",
+        ":loss_scale_optimizer",
        ":math_ops",
        ":platform",
        ":pywrap_tensorflow",
--- a/tensorflow/python/training/experimental/loss_scale.py
+++ b/tensorflow/python/training/experimental/loss_scale.py
@ -0,0 +1,352 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains LossScale classes."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+import six
+
+from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.training.tracking import base as trackable
+from tensorflow.python.ops import variable_scope
+
+
+# TODO(reedwm): Merge this with tf.keras.mixed_precision.experimental.LossScale
+@six.add_metaclass(abc.ABCMeta)
+class LossScale(trackable.Trackable):
+  """Loss scale base class.
+
+  Instances of this class represent a loss scale. Calling instances of this
+  class returns the loss scale as a scalar float32 tensor, while method
+  `update()` updates the loss scale depending on the values of the gradients.
+  Optimizers use instances of this class to scale loss and gradients.
+
+  Note: this LossScale class can only be used with a v1 optimizer wrapper,
+  tf.train.experimental.MixedPrecisionLossScaleOptimizer. For a v2
+  wrapper, tf.keras.mixed_precision.experimental.LossScaleOptimizer, a
+  tf.keras.mixed_precision.experimental.LossScale should be used instead.
+  """
+
+  def __init__(self):
+    """Initializes the loss scale class."""
+    self._weights = {}
+
+  @abc.abstractmethod
+  def __call__(self):
+    """Returns the current loss scale as a scalar `float32` tensor."""
+    pass
+
+  @abc.abstractmethod
+  def update(self, grads):
+    """Updates the value of the loss scale.
+
+    The loss scale will be potentially updated, based on the value of `grads`.
+    The tensor returned by calling this class is only updated when this function
+    is evaluated.
+
+    In eager mode, this directly updates the loss scale, so that calling
+    `__call__` will return the newly updated loss scale. In graph mode,
+    this returns an op that, when evaluated, updates the loss scale.
+
+    This function also returns a `should_apply_gradients` bool. If False,
+    gradients should not be applied to the variables that step, as nonfinite
+    gradients were found, and the loss scale has been be updated to reduce the
+    chance of finding nonfinite gradients in the next step. Some loss scale
+    classes will always return True, as they cannot adjust themselves in
+    response to nonfinite gradients.
+
+    When a DistributionStrategy is used, this function may only be called in a
+    cross-replica context.
+
+    Args:
+      grads: A list of unscaled gradients, each which is the gradient of the
+        loss with respect to a weight. The gradients should have already been
+        divided by the loss scale being before passed to this function.
+
+    Returns:
+      update_op: In eager mode, None. In graph mode, an op to update the loss
+        scale.
+      should_apply_gradients: Either a bool or a scalar boolean tensor. If
+        False, the caller should skip applying `grads` to the variables this
+        step.
+    """
+    pass
+
+  def _add_weight(self, name, initial_value, dtype=None):
+    """Adds a weight to this loss scale manager..
+
+    Args:
+      name: Variable name.
+      initial_value: The variable's initial value.
+      dtype: The type of the variable.
+
+    Returns:
+      A variable.
+
+    Raises:
+      RuntimeError: If a weight with `name` has already been added.
+    """
+    variable = variable_scope.variable(
+        initial_value=initial_value,
+        name=name,
+        dtype=dtype,
+        trainable=False,
+        use_resource=True,
+        synchronization=variables.VariableSynchronization.AUTO,
+        # Set aggregation to NONE, as loss scaling variables should never be
+        # aggregated.
+        aggregation=variables.VariableAggregation.NONE)
+    if context.executing_eagerly():
+      graph_key = None
+    else:
+      graph = ops.get_default_graph()
+      graph_key = graph._graph_key  # pylint: disable=protected-access
+
+    key = (name, graph_key)
+    if self._weights.get(key, None) is not None:
+      raise RuntimeError('Duplicate variables detected. {}'.format(key))
+    self._weights[key] = variable
+    self._handle_deferred_dependencies(name=name, trackable=variable)
+    return variable
+
+  @property
+  def _checkpoint_dependencies(self):
+    """From Trackable. Gather graph-specific weights to save."""
+    if context.executing_eagerly():
+      graph_key = None
+    else:
+      graph = ops.get_default_graph()
+      graph_key = graph._graph_key  # pylint: disable=protected-access
+    weights = []
+    for (name, g), v in sorted(self._weights.items(), key=lambda i: i[0][0]):
+      if g == graph_key:
+        weights.append(trackable.TrackableReference(name=name, ref=v))
+    return super(LossScale, self)._checkpoint_dependencies + weights
+
+  def _lookup_dependency(self, name):
+    """From Trackable. Find a weight in the current graph."""
+    unconditional = super(LossScale, self)._lookup_dependency(name)
+    if unconditional is not None:
+      return unconditional
+    if context.executing_eagerly():
+      graph_key = None
+    else:
+      graph = ops.get_default_graph()
+      graph_key = graph._graph_key  # pylint: disable=protected-access
+    return self._weights.get((name, graph_key), None)
+
+
+class FixedLossScale(LossScale):
+  """Loss scale class with a fixed value.
+
+  The loss scale is not updated for the lifetime of the class.
+  """
+
+  def __init__(self, loss_scale_value):
+    """Creates the fixed loss scale.
+
+    Args:
+      loss_scale_value: A Python float. Its ideal value varies depending on
+        models to run. Choosing a too small loss_scale might affect model
+        quality; a too big loss_scale might cause inf or nan. There is no single
+        right loss_scale to apply. There is no harm choosing a relatively big
+        number as long as no nan or inf is encountered in training.
+
+    Raises:
+      ValueError: If loss_scale is less than 1.
+    """
+    super(FixedLossScale, self).__init__()
+    if not isinstance(loss_scale_value, six.integer_types + (float,)):
+      raise ValueError('loss_scale must be a Python int or float.')
+    if loss_scale_value < 1:
+      raise ValueError('loss scale must be at least 1.')
+    self._tensor_loss_scale = ops.convert_to_tensor(
+        loss_scale_value, dtype=dtypes.float32)
+
+  def __call__(self):
+    return self._tensor_loss_scale
+
+  def update(self, grads):
+    del grads
+    return control_flow_ops.no_op(), True
+
+
+def _is_all_finite(grads):
+  """Returns a scalar boolean tensor indicating if all gradients are finite."""
+  is_finite_per_grad = [
+      math_ops.reduce_all(math_ops.is_finite(g)) for g in grads
+  ]
+  return math_ops.reduce_all(is_finite_per_grad)
+
+
+def _op_in_graph_mode(tensor):
+  """Returns the tensor's op in graph mode, or the tensor in eager mode.
+
+  This is useful because sometimes an op is needed in graph mode instead of a
+  tensor. In eager mode, there are no ops.
+
+  Args:
+    tensor: A tensor.
+
+  Returns:
+    The tensor's op in graph mode. The tensor in eager mode.
+  """
+  if context.executing_eagerly():
+    return tensor
+  return tensor.op
+
+
+def _assign_if_finite(var, value):
+  """Assigns a value to a variable if the value is finite."""
+  return control_flow_ops.cond(
+      math_ops.is_finite(value), lambda: _op_in_graph_mode(var.assign(value)),
+      control_flow_ops.no_op)
+
+
+class DynamicLossScale(LossScale):
+  """Loss scale class that dynamically adjusts the loss scale.
+
+  Dynamic loss scaling works by adjusting the loss scale as training progresses.
+  The goal is to keep the loss scale as high as possible without overflowing the
+  gradients. As long as the gradients do not overflow, raising the loss scale
+  never hurts.
+
+  The algorithm starts by setting the loss scale to an initial value. Every N
+  steps that the gradients are finite, the loss scale is increased by some
+  factor. However, if a NaN or Inf gradient is found, the gradients for that
+  step are not applied, and the loss scale is decreased by the factor. This
+  process tends to keep the loss scale as high as possible without gradients
+  overflowing.
+  """
+
+  def __init__(self,
+               initial_loss_scale=2**15,
+               increment_period=2000,
+               multiplier=2.):
+    """Constructor of exponential-update loss scale class.
+
+    Args:
+      initial_loss_scale: A Python float.  The loss scale to use at the
+        beginning. It's better to start this at a very high number, because a
+        loss scale that is too high gets lowered far more quickly than a loss
+        scale that is to low gets raised. The default is 2 ** 15, which is
+        approximately half the maximum float16 value.
+      increment_period: Increases loss scale every `increment_period`
+        consecutive steps that finite gradients are encountered. If a nonfinite
+        gradient is encountered, the count is reset back to zero.
+      multiplier: The multiplier to use when increasing or decreasing the loss
+        scale.
+    """
+    super(DynamicLossScale, self).__init__()
+    self._initial_loss_scale = float(initial_loss_scale)
+    self._increment_period = int(increment_period)
+    self._multiplier = float(multiplier)
+
+    self._current_loss_scale = self._add_weight(
+        name='loss_scale',
+        dtype=dtypes.float32,
+        initial_value=self._initial_loss_scale)
+    # The number of consecutive steps with finite gradients since the last
+    # nonfinite gradient or change in loss scale.
+    self._num_good_steps = self._add_weight(
+        name='good_steps', dtype=dtypes.int64, initial_value=0)
+
+  @property
+  def initial_loss_scale(self):
+    return self._initial_loss_scale
+
+  @property
+  def increment_period(self):
+    return self._increment_period
+
+  @property
+  def multiplier(self):
+    return self._multiplier
+
+  def __call__(self):
+    return self._current_loss_scale
+
+  def update(self, grads):
+    """Updates loss scale based on if gradients are finite in current step."""
+    if distribution_strategy_context.has_strategy():
+      distribution = distribution_strategy_context.get_cross_replica_context()
+
+      def get_is_finite(grads):
+        is_finite = _is_all_finite(grads)
+        # We cast to float, because we cannot reduce booleans with
+        # DistributionStrategy.
+        return math_ops.cast(is_finite, dtypes.float32)
+
+      is_finite_float = distribution.extended.call_for_each_replica(
+          get_is_finite, args=(grads,))
+      reduced_is_finite_float = distribution.reduce(reduce_util.ReduceOp.SUM,
+                                                    is_finite_float, axis=None)
+      is_finite = math_ops.equal(reduced_is_finite_float,
+                                 distribution.num_replicas_in_sync)
+    else:
+      is_finite = _is_all_finite(grads)
+
+    def update_if_finite_grads():
+      """Update assuming the gradients are finite."""
+
+      def incr_loss_scale():
+        new_loss_scale = self._current_loss_scale * self._multiplier
+        return control_flow_ops.group(
+            _assign_if_finite(self._current_loss_scale, new_loss_scale),
+            self._num_good_steps.assign(0))
+
+      return control_flow_ops.cond(
+          self._num_good_steps + 1 >= self._increment_period,
+          incr_loss_scale, lambda: _op_in_graph_mode(
+              self._num_good_steps.assign_add(1)))
+
+    def update_if_not_finite_grads():
+      """Update assuming the gradients are nonfinite."""
+
+      new_loss_scale = math_ops.maximum(
+          self._current_loss_scale / self._multiplier, 1)
+      return control_flow_ops.group(
+          self._num_good_steps.assign(0),
+          self._current_loss_scale.assign(new_loss_scale))
+
+    update_op = control_flow_ops.cond(is_finite, update_if_finite_grads,
+                                      update_if_not_finite_grads)
+    should_apply_gradients = is_finite
+    return update_op, should_apply_gradients
+
+
+def get(identifier):
+  """Get a loss scale object."""
+  if isinstance(identifier, six.integer_types + (float,)):
+    return FixedLossScale(identifier)
+  if identifier == 'dynamic':
+    return DynamicLossScale()
+  if isinstance(identifier, LossScale):
+    return identifier
+  elif identifier is None:
+    return None
+  else:
+    raise ValueError('Could not interpret loss scale identifier: %s' %
+                     identifier)
--- a/tensorflow/python/training/experimental/loss_scale_optimizer.py
+++ b/tensorflow/python/training/experimental/loss_scale_optimizer.py
@ -0,0 +1,237 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains LossScale classes."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import smart_cond
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.training import optimizer
+from tensorflow.python.training.experimental import loss_scale as loss_scale_module
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export(v1=['train.experimental.MixedPrecisionLossScaleOptimizer'])
+class MixedPrecisionLossScaleOptimizer(optimizer.Optimizer):
+  """An optimizer that applies loss scaling.
+
+  Loss scaling is a process that multiplies the loss by a multiplier called the
+  loss scale, and divides each gradient by the same multiplier. The pseudocode
+  for this process is:
+
+  ```
+  loss = ...
+  loss *= loss_scale
+  grads = gradients(loss, vars)
+  grads /= loss_scale
+  ```
+
+  Mathematically, loss scaling has no effect, but can help avoid numerical
+  underflow in intermediate gradients when float16 tensors are used for mixed
+  precision training. By multiplying the loss, each intermediate gradient will
+  have the same multiplier applied.
+
+  The loss scale can either be a fixed constant, chosen by the user, or be
+  dynamically determined. Dynamically determining the loss scale is convenient
+  as a loss scale does not have to be explicitly chosen. However it reduces
+  performance.
+
+  This optimizer wraps another optimizer and applies loss scaling to it via a
+  `LossScale`. Loss scaling is applied whenever gradients are
+  computed, such as through `minimize()`.
+  """
+
+  def __init__(self, opt, loss_scale):
+    if not isinstance(opt, optimizer.Optimizer):
+      raise ValueError('"opt" must be an instance of Optimizer, but got: %s' %
+                       type(opt))
+    self._optimizer = opt
+
+    use_locking = opt._use_locking  # pylint: disable=protected-access
+    name = opt.get_name()
+    super(MixedPrecisionLossScaleOptimizer, self).__init__(use_locking, name)
+
+    self._loss_scale = loss_scale_module.get(loss_scale)
+    self._track_trackable(self._optimizer, 'base_optimizer')
+    self._track_trackable(self._loss_scale, 'loss_scale')
+
+  def _doing_dynamic_loss_scaling(self):
+    """Check if `_loss_scale` dynamically manages the loss scale."""
+    return isinstance(self._loss_scale, loss_scale_module.DynamicLossScale)
+
+  def compute_gradients(self,
+                        loss,
+                        var_list=None,
+                        gate_gradients=optimizer.Optimizer.GATE_OP,
+                        aggregation_method=None,
+                        colocate_gradients_with_ops=False,
+                        grad_loss=None):
+    """Compute gradients of `loss` for the variables in `var_list`.
+
+    This adjusts the dynamic range of the gradient evaluation by scaling up
+    the `loss` value. The gradient values are then scaled back down by the
+    recipricol of the loss scale. This is useful in reduced precision training
+    where small gradient values would otherwise underflow the representable
+    range.
+
+    Args:
+      loss: A Tensor containing the value to minimize or a callable taking no
+        arguments which returns the value to minimize. When eager execution is
+        enabled it must be a callable.
+      var_list: Optional list or tuple of `tf.Variable` to update to minimize
+        `loss`.  Defaults to the list of variables collected in the graph under
+        the key `GraphKeys.TRAINABLE_VARIABLES`.
+      gate_gradients: How to gate the computation of gradients.  Can be
+        `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`.
+      aggregation_method: Specifies the method used to combine gradient terms.
+        Valid values are defined in the class `AggregationMethod`.
+      colocate_gradients_with_ops: If True, try colocating gradients with the
+        corresponding op.
+      grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
+
+    Returns:
+      A list of (gradient, variable) pairs. Variable is always present, but
+      gradient can be `None`.
+    """
+    loss = self._scale_loss(loss)
+    grads_and_vars = self._optimizer.compute_gradients(
+        loss=loss,
+        var_list=var_list,
+        gate_gradients=gate_gradients,
+        aggregation_method=aggregation_method,
+        colocate_gradients_with_ops=colocate_gradients_with_ops,
+        grad_loss=grad_loss)
+
+    grads = [g for g, _ in grads_and_vars]
+    variables = [v for _, v in grads_and_vars]
+    scaled_grads = self._scale_grads(grads)
+    return list(zip(scaled_grads, variables))
+
+  def _scale_loss(self, loss):
+    loss_scale = self._loss_scale()
+    if callable(loss):
+      return lambda: loss() * loss_scale
+    return loss * loss_scale
+
+  def _scale_grads(self, grads):
+    loss_scale = self._loss_scale()
+    loss_scale_reciprical = 1 / loss_scale
+    return [
+        None if g is None else self._scale_grad(g, loss_scale_reciprical)
+        for g in grads
+    ]
+
+  def _scale_grad(self, grad, loss_scale_reciprical):
+    if isinstance(grad, ops.IndexedSlices):
+      grad_vals = grad.values * loss_scale_reciprical
+      return ops.IndexedSlices(grad_vals, grad.indices, grad.dense_shape)
+    return grad * loss_scale_reciprical
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    """Apply gradients to variables.
+
+    This is the second part of `minimize()`. It returns an `Operation` that
+    conditionally applies gradients if all gradient values are finite.
+    Otherwise no update is performed (nor is `global_step` incremented).
+
+    Args:
+      grads_and_vars: List of (gradient, variable) pairs as returned by
+        `compute_gradients()`.
+      global_step: Optional `Variable` to increment by one after the variables
+        have been updated.
+      name: Optional name for the returned operation.  Default to the name
+        passed to the `Optimizer` constructor.
+
+    Returns:
+      An `Operation` that conditionally applies the specified gradients. If
+      `global_step` was not None, that operation also increments `global_step`.
+
+    Raises:
+      RuntimeError: If you should use `_distributed_apply()` instead.
+    """
+    if distribution_strategy_context.in_cross_replica_context():
+      raise ValueError('apply_gradients() must be called in a replica context.')
+
+    if not self._doing_dynamic_loss_scaling():
+      return self._optimizer.apply_gradients(grads_and_vars, global_step, name)
+
+    replica_context = distribution_strategy_context.get_replica_context()
+
+    # TODO(nluehr) cleanup GraphKeys.TRAIN_OP
+    return replica_context.merge_call(
+        self._distributed_apply, args=(grads_and_vars, global_step, name))
+
+  def _distributed_apply(self,
+                         distribution,
+                         grads_and_vars,
+                         global_step=None,
+                         name=None):
+    """A version of `apply_gradients` for cross replica context.
+
+    When users are in a cross replica strategy, they must call this rather than
+    `apply_gradients()`.
+
+    Args:
+      distribution: a `DistributionStrategy` object.
+      grads_and_vars: List of (gradient, variable) pairs as returned by
+        `compute_gradients()` and then aggregated across replicas.
+      global_step: Optional (mirrored) `Variable` to increment by one after the
+        variables have been updated.
+      name: Optional name for the returned operation. Default to the name passed
+        to the `Optimizer` constructor.
+
+    Returns:
+      An `Operation` that applies the specified gradients across all
+      replicas. If `global_step` was not None, that operation also
+      increments `global_step`
+    """
+    name = name if name is not None else self.get_name()
+    grads = [g for g, _ in grads_and_vars]
+    loss_scale_update_op, should_apply_grads = (self._loss_scale.update(grads))
+
+    def apply_fn():
+      return self._apply_gradients(distribution, grads_and_vars, global_step,
+                                   name + '-wrapped')
+
+    maybe_apply_op = smart_cond.smart_cond(should_apply_grads, apply_fn,
+                                           control_flow_ops.no_op)
+    return control_flow_ops.group(
+        maybe_apply_op, loss_scale_update_op, name=name)
+
+  def _apply_gradients(self, distribution, grads_and_vars, global_step, name):
+    """Unconditionally apply gradients in cross replica context."""
+    update_ops = distribution.extended.call_for_each_replica(
+        self._optimizer.apply_gradients,
+        args=(grads_and_vars, global_step, name))
+    return distribution.group(update_ops)
+
+  def _apply_sparse(self, grad, var):
+    """This function should never be called."""
+    raise RuntimeError('This function should never be called')
+
+  def _apply_dense(self, grad, var):
+    """This function should never be called."""
+    raise RuntimeError('This function should never be called')
+
+  def _resource_apply_sparse(self, grad, handle, indices):
+    """This function should never be called."""
+    raise RuntimeError('This function should never be called')
+
+  def _resource_apply_dense(self, grad, handle):
+    """This function should never be called."""
+    raise RuntimeError('This function should never be called')
--- a/tensorflow/python/training/experimental/loss_scale_optimizer_test.py
+++ b/tensorflow/python/training/experimental/loss_scale_optimizer_test.py
@ -0,0 +1,266 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for MixedPrecisionLossScaleOptimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from absl.testing import parameterized
+
+from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.distribute import mirrored_strategy
+from tensorflow.python.eager import context
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.mixed_precision.experimental import test_util as mp_test_util
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import gradient_descent
+from tensorflow.python.training import momentum
+from tensorflow.python.training.experimental import loss_scale as loss_scale_module
+from tensorflow.python.training.experimental import loss_scale_optimizer
+from tensorflow.python.training.tracking import util as trackable_utils
+
+# If called outside any strategy.scope() calls, this will return the default
+# strategy.
+default_strategy_fn = distribution_strategy_context.get_strategy
+
+
+def create_mirrored_strategy():
+  if context.num_gpus() >= 1:
+    return mirrored_strategy.MirroredStrategy(['cpu:0', 'gpu:0'])
+  else:
+    return mirrored_strategy.MirroredStrategy(['cpu:0'])
+
+
+TESTCASES = ({
+    'testcase_name': 'Base',
+    'strategy_fn': default_strategy_fn
+}, {
+    'testcase_name': 'Distribute',
+    'strategy_fn': create_mirrored_strategy
+})
+
+
+def get_gradients(opt, loss, params):
+  grads_and_vars = opt.compute_gradients(loss, params)
+  grads, _ = zip(*grads_and_vars)
+  return grads
+
+
+class MixedPrecisionLossScaleOptimizerTest(test.TestCase,
+                                           parameterized.TestCase):
+
+  def _run_if_in_graph_mode(self, val):
+    # Running only in graph mode is useful, because optimizers sometimes return
+    # a value that, in Graph mode, is runnable with self.evaluate. But in Eager
+    # mode, the optimizer already does the computations and the return value
+    # cannot be run.
+    if not context.executing_eagerly():
+      self.evaluate(val)
+
+  def _run_fn_with_grad_check(self, strategy, var, opt, expected_grad):
+    grad_check_fn = mp_test_util.create_identity_with_grad_check_fn(
+        expected_grad)
+    loss = lambda: grad_check_fn(var) / strategy.num_replicas_in_sync
+    return lambda: opt.minimize(loss, var_list=[var])
+
+  @parameterized.named_parameters(*TESTCASES)
+  @test_util.run_in_graph_and_eager_modes
+  def testFixedLossScaleAppliedToLossWithMinimize(self, strategy_fn):
+    with strategy_fn().scope() as strategy:
+      var = variables.Variable([5.0])
+      opt = gradient_descent.GradientDescentOptimizer(2.0)
+      loss_scale = 10.
+      opt = loss_scale_optimizer.MixedPrecisionLossScaleOptimizer(
+          opt, loss_scale)
+      # We need num_replicas_in_sync to divide loss_scale, otherwise loss_scale
+      # / strategy.num_replicas_in_sync will not be exact, which could lead to
+      # assertion failures due to rounding issues.
+      self.assertEqual(loss_scale % strategy.num_replicas_in_sync, 0)
+      run_fn = self._run_fn_with_grad_check(
+          strategy, var, opt, loss_scale / strategy.num_replicas_in_sync)
+      run_op = strategy.experimental_run(run_fn)
+      self.evaluate(variables.global_variables_initializer())
+      self._run_if_in_graph_mode(run_op)
+      # The loss is the identity of the variable. Therefore the gradient is 1,
+      # and so the variable will be init_val - grad * lr == 5 - 1 * 2 == 3
+      self.assertAllClose([3.], self.evaluate(var))
+
+  @test_util.deprecated_graph_mode_only
+  def testFixedLossScaleAppliedToLossWithGetGradients(self):
+    var = variables.Variable([2.0])
+    opt = gradient_descent.GradientDescentOptimizer(1.0)
+    loss_scale = 10.
+    opt = loss_scale_optimizer.MixedPrecisionLossScaleOptimizer(opt, loss_scale)
+    grad_check_fn = mp_test_util.create_identity_with_grad_check_fn(loss_scale)
+    loss = grad_check_fn(var)
+    run_op = get_gradients(opt, loss, [var])
+    self.evaluate(variables.global_variables_initializer())
+    # This will cause an assertion to run, as
+    # mp_test_util.create_identity_with_grad_check_fn added an assertion op.
+    self.evaluate(run_op)
+
+  @parameterized.named_parameters(*TESTCASES)
+  @test_util.run_in_graph_and_eager_modes
+  def testDynamicLossScale(self, strategy_fn):
+    strategy = strategy_fn()
+    learning_rate = 2.
+    expected_gradient = resource_variable_ops.ResourceVariable(
+        learning_rate / strategy.num_replicas_in_sync)
+    with strategy.scope():
+      var = variables.Variable([5.0])
+      opt = gradient_descent.GradientDescentOptimizer(learning_rate)
+      loss_scale = loss_scale_module.DynamicLossScale(
+          initial_loss_scale=2, increment_period=1, multiplier=2)
+      opt = loss_scale_optimizer.MixedPrecisionLossScaleOptimizer(
+          opt, loss_scale)
+      self.assertEqual(
+          loss_scale.initial_loss_scale % strategy.num_replicas_in_sync, 0)
+
+      run_fn = self._run_fn_with_grad_check(strategy, var, opt,
+                                            expected_gradient)
+      run_op = strategy.experimental_run(run_fn)
+      self.evaluate(variables.global_variables_initializer())
+      self._run_if_in_graph_mode(run_op)
+      # The loss is the identity of the variable. Therefore the gradient is 1,
+      # and so the variable will be init_val - grad * lr == 5 - 1 * 2 == 3
+      self.assertAllClose([3.], self.evaluate(var))
+
+      # Loss scale will be double, so the expected gradient is also doubled.
+      self.evaluate(
+          expected_gradient.assign(2 * learning_rate /
+                                   strategy.num_replicas_in_sync))
+      run_op = strategy.experimental_run(run_fn)
+      self._run_if_in_graph_mode(run_op)
+      # As before, the 2 is subtracted from the variable, making it's new value
+      # 1.
+      self.assertAllClose([1.], self.evaluate(var))
+
+  @parameterized.named_parameters(*TESTCASES)
+  @test_util.run_in_graph_and_eager_modes
+  def testDynamicUpdate(self, strategy_fn):
+    with strategy_fn().scope() as strategy:
+      var = variables.Variable([1.0, 2.0])
+      opt = gradient_descent.GradientDescentOptimizer(1.0)
+      loss_scale = loss_scale_module.DynamicLossScale(
+          initial_loss_scale=2, increment_period=1, multiplier=2)
+      opt = loss_scale_optimizer.MixedPrecisionLossScaleOptimizer(
+          opt, loss_scale)
+
+      # Test optimizer with finite gradients
+      loss = lambda: var * 2.0 / strategy.num_replicas_in_sync
+      run_fn = lambda: opt.minimize(loss, var_list=[var])
+      run_op = strategy.experimental_run(run_fn)
+      self.evaluate(variables.global_variables_initializer())
+      self._run_if_in_graph_mode(run_op)
+      # Gradient is 2, so variable will have 2 subtracted from it
+      self.assertAllClose([-1.0, 0.0], self.evaluate(var))
+      # Loss scale has doubled from 2 to 4
+      self.assertEqual(4., self.evaluate(opt._loss_scale()))
+
+      # Test optimizer with NaN gradients
+      loss = lambda: var * float('NaN')
+      run_fn = lambda: opt.minimize(loss, var_list=[var])
+      run_op = strategy.experimental_run(run_fn)
+      self._run_if_in_graph_mode(run_op)
+      # Variable should not change from before, due to NaN gradients.
+      self.assertAllClose(self.evaluate(var), [-1.0, 0.0])
+      # Loss scale should half due to NaN gradients.
+      self.assertEqual(2., self.evaluate(opt._loss_scale()))
+
+  @parameterized.named_parameters(*TESTCASES)
+  @test_util.run_in_graph_and_eager_modes
+  def testDynamicLossScaleWithSlots(self, strategy_fn):
+    with strategy_fn().scope() as strategy:
+      var = variables.Variable([1.0, 2.0])
+      # An SGD optimizer with momentum has slot variables.
+      opt = momentum.MomentumOptimizer(1.0, momentum=1.)
+      initial_loss_scale = 2.
+      loss_scale = loss_scale_module.DynamicLossScale(
+          initial_loss_scale=initial_loss_scale,
+          increment_period=1,
+          multiplier=4)
+      opt = loss_scale_optimizer.MixedPrecisionLossScaleOptimizer(
+          opt, loss_scale)
+      loss = lambda: var / strategy.num_replicas_in_sync
+      run_fn = lambda: opt.minimize(loss, var_list=[var])
+      run_op = strategy.experimental_run(run_fn)
+      self.evaluate(variables.global_variables_initializer())
+      self._run_if_in_graph_mode(run_op)
+      # The momentum accumulator starts at 0 and the gradient is 1. The
+      # accumulator is incremented by the gradient, so it is now 1. Then the
+      # variable is subtracted by the accumulator, so the variable is subtracted
+      # by 1.
+      self.assertAllClose([0.0, 1.0], self.evaluate(var))
+      self.assertEqual(self.evaluate(opt._loss_scale()), initial_loss_scale * 4)
+
+      run_op = strategy.experimental_run(run_fn)
+      self._run_if_in_graph_mode(run_op)
+      # The momentum accumulator was 1 before this step and the gradient is 1.
+      # The accumulator is incremented by the gradient, so it is now 2. Then the
+      # variable is subtracted by the accumulator, so the variable is subtracted
+      # by 2.
+      self.assertAllClose([-2., -1.], self.evaluate(var))
+      self.assertEqual(
+          self.evaluate(opt._loss_scale()), initial_loss_scale * 16)
+
+  @parameterized.named_parameters(*TESTCASES)
+  @test_util.run_in_graph_and_eager_modes
+  def testCheckpoint(self, strategy_fn):
+    strategy = strategy_fn()
+    if (isinstance(strategy, mirrored_strategy.MirroredStrategy) and
+        not context.executing_eagerly()):
+      # TODO(b/121381184): Enable running the test in this case.
+      return
+
+    with self.test_session(), strategy.scope():
+      # Build and run a simple model.
+      var = variables.Variable([2.0])
+      loss_scale = loss_scale_module.DynamicLossScale(
+          initial_loss_scale=1., increment_period=2., multiplier=2.)
+      opt = momentum.MomentumOptimizer(1.0, momentum=1.)
+      opt = loss_scale_optimizer.MixedPrecisionLossScaleOptimizer(
+          opt, loss_scale)
+      run_fn = lambda: opt.minimize(lambda: var + 1., var_list=[var])
+      opt_op = strategy.experimental_run(run_fn)
+      self.evaluate(variables.global_variables_initializer())
+      self.evaluate(opt_op)
+      self.assertEqual(self.evaluate(loss_scale()), 1.)
+      self.assertEqual(self.evaluate(loss_scale._num_good_steps), 1)
+
+      # Save a checkpoint.
+      checkpoint = trackable_utils.Checkpoint(optimizer=opt)
+      prefix = os.path.join(self.get_temp_dir(), 'ckpt')
+      save_path = checkpoint.save(prefix)
+
+      # Run model again.
+      self.evaluate(strategy.experimental_run(run_fn))
+      self.assertEqual(self.evaluate(loss_scale()), 2.)
+      self.assertEqual(self.evaluate(loss_scale._num_good_steps), 0)
+
+      # Load checkpoint and ensure loss scale is back to it's original value.
+      status = checkpoint.restore(save_path)
+      status.assert_consumed()
+      status.run_restore_ops()
+      self.assertEqual(self.evaluate(loss_scale()), 1.)
+      self.assertEqual(self.evaluate(loss_scale._num_good_steps), 1)
+
+
+if __name__ == '__main__':
+  test.main()
--- a/tensorflow/python/training/experimental/loss_scale_test.py
+++ b/tensorflow/python/training/experimental/loss_scale_test.py
@ -0,0 +1,265 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for LossScale classes.."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.distribute import mirrored_strategy
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training.experimental import loss_scale as loss_scale_module
+
+# TODO(reedwm): Create test case using multiple graphs
+
+# If called outside any strategy.scope() calls, this will return the default
+# strategy.
+default_strategy_fn = distribution_strategy_context.get_strategy
+
+
+def create_mirrored_strategy():
+  if context.num_gpus() >= 1:
+    return mirrored_strategy.MirroredStrategy(['cpu:0', 'gpu:0'])
+  else:
+    return mirrored_strategy.MirroredStrategy(['cpu:0'])
+
+
+TESTCASES = ({
+    'testcase_name': 'base',
+    'strategy_fn': default_strategy_fn
+}, {
+    'testcase_name': 'distribute',
+    'strategy_fn': create_mirrored_strategy
+})
+
+
+class FixedLossScaleTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_basic(self):
+    loss_scale_value = 1000
+    loss_scale = loss_scale_module.FixedLossScale(loss_scale_value)
+
+    update_op, should_apply = loss_scale.update([constant_op.constant(0.)])
+    self.evaluate(update_op)
+    # should_apply should be a bool instead of a tensor, so that a tf.cond does
+    # not have to be built in the graph by the caller.
+    self.assertIsInstance(should_apply, bool)
+    self.assertTrue(should_apply)
+    self.assertEqual(loss_scale_value, self.evaluate(loss_scale()))
+
+    update_op, should_apply = loss_scale.update(
+        [constant_op.constant(float('NaN'))])
+    self.evaluate(update_op)
+    self.assertIsInstance(should_apply, bool)
+    self.assertTrue(should_apply)
+    self.assertEqual(loss_scale_value, self.evaluate(loss_scale()))
+
+
+def _get_example_iter(inputs):
+  dataset = dataset_ops.Dataset.from_tensor_slices(inputs)
+  return dataset_ops.make_one_shot_iterator(dataset)
+
+
+class DynamicLossScaleTest(test.TestCase, parameterized.TestCase):
+
+  def _get_tensor(self, is_finite):
+    tensor = control_flow_ops.cond(is_finite, lambda: 1., lambda: float('NaN'))
+
+    if not distribution_strategy_context.has_strategy():
+      return tensor
+
+    def get():
+      rep_id = (
+          distribution_strategy_context.get_replica_context()
+          .replica_id_in_sync_group)
+      return control_flow_ops.cond(
+          math_ops.equal(rep_id, 0), lambda: tensor, lambda: 1.)
+
+    distribution = distribution_strategy_context.get_strategy()
+    return distribution.extended.call_for_each_replica(get)
+
+  def _test_helper(self,
+                   inputs,
+                   expected_outputs,
+                   initial_loss_scale=1.,
+                   increment_period=2,
+                   multiplier=2):
+    loss_scale = loss_scale_module.DynamicLossScale(
+        initial_loss_scale=initial_loss_scale,
+        increment_period=increment_period,
+        multiplier=multiplier)
+    itr = _get_example_iter(inputs)
+
+    def update():
+      is_finite = itr.get_next()
+      grad = self._get_tensor(is_finite)
+      update_op, should_apply_gradients = loss_scale.update([grad])
+      assert_op = check_ops.assert_equal(should_apply_gradients, is_finite)
+      if context.executing_eagerly():
+        return
+      with ops.control_dependencies([assert_op]):
+        return array_ops.identity(update_op)
+
+    actual_outputs = []
+
+    if not context.executing_eagerly():
+      update_op = update()
+      self.evaluate(variables.global_variables_initializer())
+    for _ in range(len(inputs)):
+      if context.executing_eagerly():
+        update()
+      else:
+        self.evaluate(update_op)
+      actual_outputs.append(self.evaluate(loss_scale()))
+    self.assertEqual(actual_outputs, expected_outputs)
+
+  @parameterized.named_parameters(*TESTCASES)
+  @test_util.run_in_graph_and_eager_modes
+  def test_increase(self, strategy_fn):
+    with strategy_fn().scope():
+      inputs = [True] * 6
+      expected_outputs = [1, 2, 2, 4, 4, 8]
+      self._test_helper(inputs, expected_outputs)
+
+  @parameterized.named_parameters(*TESTCASES)
+  @test_util.run_in_graph_and_eager_modes
+  def test_keep_increasing_until_capped(self, strategy_fn):
+    with strategy_fn().scope():
+      init_loss_scale = np.finfo(np.float32).max / 4
+      max_float = np.finfo(np.float32).max
+
+      inputs = [True] * 6
+      # Output is capped the 2nd time it doubles.
+      expected_outputs = [
+          init_loss_scale, init_loss_scale * 2, init_loss_scale * 2, max_float,
+          max_float, max_float
+      ]
+
+      self._test_helper(inputs, expected_outputs, init_loss_scale)
+
+  @parameterized.named_parameters(*TESTCASES)
+  @test_util.run_in_graph_and_eager_modes
+  def test_decrease_every_step(self, strategy_fn):
+    with strategy_fn().scope():
+      inputs = [False] * 6
+      init_loss_scale = 1024
+      expected_outputs = [512, 256, 128, 64, 32, 16]
+
+    self._test_helper(inputs, expected_outputs, init_loss_scale)
+
+  @parameterized.named_parameters(*TESTCASES)
+  @test_util.run_in_graph_and_eager_modes
+  def test_keep_decreasing_until_one(self, strategy_fn):
+    with strategy_fn().scope():
+      inputs = [False] * 6
+      init_loss_scale = 16
+      expected_outputs = [8, 4, 2, 1, 1, 1]
+
+      self._test_helper(inputs, expected_outputs, init_loss_scale)
+
+  @parameterized.named_parameters(*TESTCASES)
+  @test_util.run_in_graph_and_eager_modes
+  def test_nan_clear_good_step(self, strategy_fn):
+    with strategy_fn().scope():
+      inputs = [True, True, True, False, True]
+      expected_outputs = [1, 2, 2, 1, 1]
+      self._test_helper(inputs, expected_outputs)
+
+  @parameterized.named_parameters(*TESTCASES)
+  @test_util.run_in_graph_and_eager_modes
+  def test_trigger_loss_scale_update_each_step(self, strategy_fn):
+    with strategy_fn().scope():
+      init_loss_scale = 1
+      increment_period = 1
+
+      inputs = [True] * 3 + [False, True, True]
+      expected_outputs = [2, 4, 8, 4, 8, 16]
+
+      self._test_helper(inputs, expected_outputs, init_loss_scale,
+                        increment_period)
+
+  @parameterized.named_parameters(*TESTCASES)
+  @test_util.run_in_graph_and_eager_modes
+  def test_alternating_good_and_bad_gradients_trigger_each_step(
+      self, strategy_fn):
+    with strategy_fn().scope():
+      init_loss_scale = 1
+      increment_period = 1
+
+      inputs = [True, False] * 4 + [True]
+      expected_outputs = [2, 1, 2, 1, 2, 1, 2, 1, 2]
+      self._test_helper(inputs, expected_outputs, init_loss_scale,
+                        increment_period)
+
+  @parameterized.named_parameters(*TESTCASES)
+  @test_util.run_in_graph_and_eager_modes
+  def test_alternating_good_and_bad_gradients_trigger_every_other_step(
+      self, strategy_fn):
+    with strategy_fn().scope():
+      init_loss_scale = 32
+      increment_period = 2
+
+      inputs = [True, False] * 3 + [True]
+      expected_outputs = [32, 16, 16, 8, 8, 4, 4]
+      self._test_helper(inputs, expected_outputs, init_loss_scale,
+                        increment_period)
+
+  @parameterized.named_parameters(*TESTCASES)
+  @test_util.run_in_graph_and_eager_modes
+  def test_nondefault_multiplier(self, strategy_fn):
+    with strategy_fn().scope():
+      init_loss_scale = 4
+      multiplier = 3
+      inputs = [True, True, False, True, True]
+      expected_outputs = [4, 12, 4, 4, 12]
+      self._test_helper(
+          inputs, expected_outputs, init_loss_scale, multiplier=multiplier)
+
+  @parameterized.named_parameters(*TESTCASES)
+  @test_util.run_in_graph_and_eager_modes
+  def test_random_mix_good_and_bad_gradients(self, strategy_fn):
+    with strategy_fn().scope():
+      init_loss_scale = 4
+      inputs = [
+          False, True, True, True, False, True, False, True, True, True, False
+      ]
+      expected_outputs = [2, 2, 4, 4, 2, 2, 1, 1, 2, 2, 1]
+      self._test_helper(inputs, expected_outputs, init_loss_scale)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_get(self):
+    scalar = loss_scale_module.get('dynamic')
+    scalar2 = loss_scale_module.DynamicLossScale()
+    self.assertEqual(scalar.initial_loss_scale, scalar2.initial_loss_scale)
+    self.assertEqual(scalar.increment_period, scalar2.increment_period)
+    self.assertEqual(scalar.multiplier, scalar2.multiplier)
+
+
+if __name__ == '__main__':
+  test.main()
--- a/tensorflow/python/training/training.py
+++ b/tensorflow/python/training/training.py
@ -33,6 +33,7 @@ from tensorflow.python.training.adagrad_da import AdagradDAOptimizer
 from tensorflow.python.training.proximal_adagrad import ProximalAdagradOptimizer
 from tensorflow.python.training.adam import AdamOptimizer
 from tensorflow.python.training.ftrl import FtrlOptimizer
+from tensorflow.python.training.experimental.loss_scale_optimizer import MixedPrecisionLossScaleOptimizer
 from tensorflow.python.training.momentum import MomentumOptimizer
 from tensorflow.python.training.moving_averages import ExponentialMovingAverage
 from tensorflow.python.training.optimizer import Optimizer
@ -143,4 +144,3 @@ tf_export(v1=["train.SaverDef"])(SaverDef)
 tf_export("train.SequenceExample")(SequenceExample)
 tf_export("train.ServerDef")(ServerDef)
 # pylint: enable=undefined-variable
-
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.experimental.-mixed-precision-loss-scale-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.experimental.-mixed-precision-loss-scale-optimizer.pbtxt
@ -0,0 +1,51 @@
+path: "tensorflow.train.experimental.MixedPrecisionLossScaleOptimizer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.experimental.loss_scale_optimizer.MixedPrecisionLossScaleOptimizer\'>"
+  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "GATE_GRAPH"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GATE_NONE"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GATE_OP"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'opt\', \'loss_scale\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_gradients"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "get_name"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.experimental.pbtxt
@ -1,5 +1,9 @@
 path: "tensorflow.train.experimental"
 tf_module {
+  member {
+    name: "MixedPrecisionLossScaleOptimizer"
+    mtype: "<type \'type\'>"
+  }
  member {
    name: "PythonState"
    mtype: "<type \'type\'>"
--- a/tensorflow/tools/compatibility/renames_v2.py
+++ b/tensorflow/tools/compatibility/renames_v2.py
@ -1387,6 +1387,8 @@ renames = {
        'tf.compat.v1.train.create_global_step',
    'tf.train.do_quantize_training_on_graphdef':
        'tf.compat.v1.train.do_quantize_training_on_graphdef',
+    'tf.train.experimental.MixedPrecisionLossScaleOptimizer':
+        'tf.compat.v1.train.experimental.MixedPrecisionLossScaleOptimizer',
    'tf.train.exponential_decay':
        'tf.compat.v1.train.exponential_decay',
    'tf.train.export_meta_graph':
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@ -68,6 +68,8 @@ COMMON_PIP_DEPS = [
    "//tensorflow/python/compiler:compiler",
    "//tensorflow/python:cond_v2",
    "//tensorflow/python:distributed_framework_test_lib",
+    "//tensorflow/python:loss_scale",
+    "//tensorflow/python:loss_scale_optimizer",
    "//tensorflow/python:meta_graph_testdata",
    "//tensorflow/python:spectral_ops_test_util",
    "//tensorflow/python:util_example_parser_configuration",