Merge pull request #46349 from geetachavan1/cherrypicks_ZX1AI
[Cherrypick:r2.4] Fix issue when using mixed precision with RMSprop.
commit e9c0ef3064
Changed files are under tensorflow/python/keras/mixed_precision.
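For context, a minimal sketch of the kind of setup this fix targets: a Keras model trained with the RMSprop optimizer while the mixed_float16 policy wraps its float32 variables in AutoCastVariable. The model, data, and hyperparameters below are placeholders chosen for illustration, not taken from this PR.

# Hypothetical repro-style sketch (not from this PR): RMSprop training a
# model whose float32 variables are wrapped by the mixed precision policy.
import tensorflow as tf

tf.keras.mixed_precision.set_global_policy('mixed_float16')

model = tf.keras.Sequential([
    tf.keras.layers.Dense(8, activation='relu', input_shape=(4,)),
    # Keep the output layer in float32 for numeric stability.
    tf.keras.layers.Dense(1, dtype='float32'),
])
model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.01),
              loss='mse')

x = tf.random.normal((32, 4))
y = tf.random.normal((32, 1))
model.fit(x, y, epochs=1, verbose=0)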
tensorflow/python/keras/mixed_precision/autocast_variable.py

@@ -57,12 +57,11 @@ class AutoCastVariable(variables.Variable, core.Tensor):
   called.
   """

-  def __init__(self, variable, op=None):
+  def __init__(self, variable):
     """Creates an AutoCastVariable instance.

     Args:
       variable: A floating-point resource variable to wrap.
-      op: Optional operation of this variable.

     Raises:
       ValueError: If `variable` is not a floating-point resource variable
@@ -74,7 +73,11 @@ class AutoCastVariable(variables.Variable, core.Tensor):
       raise ValueError('variable must be a floating point variable but has '
                        'type: %s' % variable.dtype.name)
     self._variable = variable
-    self._op = op
+    # 'delegate' means AutoCastVariable.op return self._variable.op, which will
+    # raise an AttributeError in Eager (as intended). If set to any other value,
+    # AutoCastVariable.op returns that value instead, which is used to set the
+    # op attribute in AutoCastVariable.assign().
+    self._op = 'delegate'

   def _should_cast(self):
     """Returns True if this variable should be casted when accessed."""
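A note on the string sentinel introduced above: None cannot serve as the "no override" marker because None is itself a meaningful op value (an eager assign produces op=None, which the tests later in this diff assert). A standalone sketch of the same pattern follows, with hypothetical names that are not part of TensorFlow.

# Hypothetical standalone sketch of the sentinel pattern used above: a
# distinct marker distinguishes "no override set" from an override whose
# value happens to be None.
_DELEGATE = 'delegate'


class OpDelegatingWrapper:
  """Wraps an object and delegates .op unless an override was set."""

  def __init__(self, wrapped):
    self._wrapped = wrapped
    self._op = _DELEGATE  # no override yet: defer to the wrapped object

  @property
  def op(self):
    if self._op == _DELEGATE:
      return self._wrapped.op  # may raise, exactly as the wrapped object would
    return self._op  # explicit override, which may legitimately be None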
@@ -199,10 +202,18 @@ class AutoCastVariable(variables.Variable, core.Tensor):
                            use_locking=None,
                            name=None,
                            read_value=True):
+    # TODO(b/146181571): This logic can be simplified once
+    # DistributedVariable.assign returns a DistributedVariable. Currently for
+    # MirroredStrategy, it returns a Mirrored value.
     if ops.executing_eagerly_outside_functions():
       assign_op = update_fn(value, use_locking, name, False)
       if read_value:
-        return create_autocast_variable(self._variable, op=assign_op)
+        # We create a new AutoCastVariable with the same underlying tf.Variable.
+        # The new AutoCastVariable is identical except the 'op' attribute is
+        # defined. This matches the behavior of tf.Variable.assign.
+        var = create_autocast_variable(self._variable)
+        var._op = assign_op  # pylint:disable=protected-access
+        return var
       return assign_op

     # Fallback to wrapping the returned variable in graph mode if possible
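With the eager path above, assigning to an AutoCastVariable now returns a fresh AutoCastVariable over the same underlying tf.Variable, carrying the assign's op. A rough usage sketch of the intended post-change behavior, using the internal autocast_variable module that the test file below also imports (eager mode assumed; not part of the PR):

# Rough sketch of the post-change assign behavior (eager mode assumed).
import tensorflow as tf
from tensorflow.python.keras.mixed_precision import autocast_variable

v = tf.Variable(1.0)
ac = autocast_variable.create_autocast_variable(v)

result = ac.assign(2.0)
# 'result' wraps the same tf.Variable as 'ac' but has its own 'op' attribute,
# mirroring tf.Variable.assign: None under eager execution.
print(result.op)   # None when executing eagerly
print(v.numpy())   # 2.0; the underlying variable was updated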
@@ -298,9 +309,9 @@ class AutoCastVariable(variables.Variable, core.Tensor):

   @property
   def op(self):
-    if self._op is not None:
-      return self._op
-    return self._variable.op
+    if self._op == 'delegate':
+      return self._variable.op
+    return self._op

   def _as_graph_element(self):
     graph_element = self._variable._as_graph_element()  # pylint:disable=protected-access
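Comparing the two versions of the property: previously, an override of None (which is what an eager assign produces) fell through to self._variable.op, and the test below shows that accessing Variable.op raises an AttributeError in eager mode; with the sentinel, an explicit None is returned as-is. A quick sketch of the plain tf.Variable baseline this is meant to mirror, assuming eager execution:

# Sketch of the tf.Variable baseline that AutoCastVariable.op now mirrors
# (eager execution assumed).
import tensorflow as tf

v = tf.Variable(1.0)

# v.op raises under eager execution...
try:
  v.op  # pylint: disable=pointless-statement
except AttributeError as e:
  print('v.op raised:', e)

# ...while the result of an eager assign reports op=None.
print(v.assign(2.0).op)  # None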
@@ -469,7 +480,7 @@ ops.register_tensor_conversion_function(AutoCastVariable,
                                         AutoCastVariable._dense_var_to_tensor)  # pylint:disable=protected-access


-def create_autocast_variable(variable, op=None):
+def create_autocast_variable(variable):
   """Creates an AutoCastVariable that wraps another variable.

   This typically just returns `AutoCastVariable(variable)`. But, if the variable
@@ -481,14 +492,13 @@ def create_autocast_variable(variable, op=None):

   Args:
     variable: A floating-point resource variable to wrap.
-    op: Optional operation of this variable.

   Returns:
     An AutoCastVariable that wraps the variable.
   """
   if not isinstance(variable, (distribute_values.DistributedVariable,
                                ps_distribute_values.AggregatingVariable)):
-    return AutoCastVariable(variable, op=op)
+    return AutoCastVariable(variable)

   class AutoCastDistributedVariable(AutoCastVariable, variable.__class__):
     """An AutoCastVariable that also subclasses from variable.__class__.
@@ -511,7 +521,7 @@ def create_autocast_variable(variable, op=None):
              ).format(v=self)
       # pylint: enable=missing-format-attribute

-  return AutoCastDistributedVariable(variable, op=op)
+  return AutoCastDistributedVariable(variable)


 class enable_auto_cast_variables(object):  # pylint:disable=invalid-name
tensorflow/python/keras/mixed_precision/autocast_variable_test.py

@@ -37,7 +37,14 @@ from tensorflow.python.framework import indexed_slices
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_combinations as combinations
 from tensorflow.python.keras.mixed_precision import autocast_variable
+from tensorflow.python.keras.optimizer_v2 import adadelta
+from tensorflow.python.keras.optimizer_v2 import adagrad
+from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.keras.optimizer_v2 import adamax
+from tensorflow.python.keras.optimizer_v2 import ftrl
 from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_v2
+from tensorflow.python.keras.optimizer_v2 import nadam
+from tensorflow.python.keras.optimizer_v2 import rmsprop
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
@@ -352,11 +359,28 @@ class AutoCastVariableTest(test.TestCase, parameterized.TestCase):
         self.assertAllClose(5., self.evaluate(run_assign()))

   @ds_combinations.generate(maybe_distribute)
-  def test_assign_op(self, distribution):
+  def test_op_attribute(self, distribution):
     with distribution.scope():
       x = get_var(0., dtypes.float32)
       x = autocast_variable.create_autocast_variable(x)

+      # Variable.op raises an AttributeError in Eager mode and is an op in graph
+      # mode. Variable.assign(...).op is None in Eager mode and an op in Graph
+      # mode or a tf.function. We test this is also true of AutoCastVariable.
+      if context.executing_eagerly():
+        with self.assertRaisesRegex(
+            AttributeError,
+            'Tensor.op is meaningless when eager execution is enabled'):
+          x.op  # pylint: disable=pointless-statement
+        self.assertIsNone(x.assign(1.0).op)
+        self.assertIsNone(x.assign_add(1.0).op)
+        self.assertIsNone(x.assign_sub(1.0).op)
+      else:
+        self.assertIsNotNone(x.op)
+        self.assertIsNotNone(x.assign(1.0).op)
+        self.assertIsNotNone(x.assign_add(1.0).op)
+        self.assertIsNotNone(x.assign_sub(1.0).op)
+
       @def_function.function
       def func():
         self.assertIsNotNone(x.assign(1.0).op)
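The eager expectations above are complemented by the tf.function branch of the test. For readers who want to see the contrast outside the test harness, a small hypothetical sketch with the public API (not part of the PR):

# Hypothetical sketch: assign results carry a real op when building a graph
# (e.g. while tracing a tf.function), but op is None in plain eager code.
import tensorflow as tf

v = tf.Variable(1.0)
print(v.assign(2.0).op)  # None: plain eager execution


@tf.function
def do_assign():
  result = v.assign(3.0)
  print('op while tracing:', result.op)  # a tf.Operation, not None
  return result


do_assign()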
@@ -503,25 +527,51 @@ class AutoCastVariableTest(test.TestCase, parameterized.TestCase):
         'dtype_to_cast_to=float32 '
         'inner_variable=MirroredVariable.*>')

-  @parameterized.named_parameters(
-      ('v1', gradient_descent_v1.GradientDescentOptimizer),
-      ('v2', gradient_descent_v2.SGD))
-  def test_optimizer(self, optimizer_class):
+  @ds_combinations.generate(combinations.combine(
+      optimizer_class=[
+          adadelta.Adadelta,
+          adagrad.Adagrad,
+          adam.Adam,
+          adamax.Adamax,
+          ftrl.Ftrl,
+          gradient_descent_v2.SGD,
+          nadam.Nadam,
+          rmsprop.RMSprop,
+          gradient_descent_v1.GradientDescentOptimizer
+      ],
+      use_tf_function=[False, True]))
+  def test_optimizer(self, optimizer_class, use_tf_function):
+    if use_tf_function and not context.executing_eagerly():
+      self.skipTest('Test does not support graph mode with tf.function')
     x = get_var(1., dtypes.float32)
     x = autocast_variable.create_autocast_variable(x)
-    opt = optimizer_class(1.)
+    y = get_var(1., dtypes.float32)
+    opt = optimizer_class(learning_rate=1.)

-    @def_function.function
     def f():
-      opt.minimize(lambda: x + 1., var_list=[x])
+      # Minimize both the AutoCastVariable and the normal tf.Variable. Both
+      # variables should be updated to the same value.
+      op = opt.minimize(lambda: x + y, var_list=[x, y])
+      return None if ops.executing_eagerly_outside_functions() else op
+
+    if use_tf_function:
+      f = def_function.function(f)

     if context.executing_eagerly():
       f()
     else:
-      op = f()  # pylint: disable=assignment-from-no-return
+      op = f()
       self.evaluate(variables.global_variables_initializer())
       self.evaluate(op)
-    self.assertEqual(self.evaluate(x), 0)
+    # Assert the AutoCastVariable has changed from its initial value
+    self.assertNotEqual(self.evaluate(x), 1.)
+    # Assert AutoCastVariable is updated correctly by comparing it to the normal
+    # variable
+    self.assertAlmostEqual(self.evaluate(x), self.evaluate(y))
+    if optimizer_class in (gradient_descent_v2.SGD,
+                           gradient_descent_v1.GradientDescentOptimizer):
+      # With SGD, the variables decreases by exactly 1
+      self.assertEqual(self.evaluate(x), 0)


 if __name__ == '__main__':
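Finally, an end-to-end eager check along the same lines as the widened test above, written against the public optimizer API plus the internal autocast_variable module. This is a hypothetical sanity sketch rather than part of the PR.

# Hypothetical eager-mode sanity check mirroring test_optimizer above:
# RMSprop should update an AutoCastVariable and a plain tf.Variable to the
# same value when both receive the same gradient.
import tensorflow as tf
from tensorflow.python.keras.mixed_precision import autocast_variable

x = autocast_variable.create_autocast_variable(tf.Variable(1.0))
y = tf.Variable(1.0)
opt = tf.keras.optimizers.RMSprop(learning_rate=1.0)

opt.minimize(lambda: x + y, var_list=[x, y])

# Both variables have moved off 1.0 by the same amount.
print(x.read_value().numpy(), y.numpy())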