From 1427bfc12ec5a3a2c6a4ffd57fc5b465d3eedfae Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Thu, 19 Nov 2020 12:41:29 -0800
Subject: [PATCH] Update gradient_checker_v2 to use a step size in the finite
 difference approximation that is exactly representable as a binary floating
 point number.

This is an old trick that in some cases avoids polluting the finite
difference approximation with rounding errors that cause false negatives in
gradient tests.

PiperOrigin-RevId: 343348502
Change-Id: I3539ae7de7105177c5a1b9144b491f36369344f4
---
 RELEASE.md                                     |  1 +
 .../python/kernel_tests/relu_op_test.py        | 53 +++++--------------
 tensorflow/python/ops/gradient_checker_v2.py   |  8 ++-
 .../tools/api/golden/v2/tensorflow.test.pbtxt  |  2 +-
 4 files changed, 21 insertions(+), 43 deletions(-)

diff --git a/RELEASE.md b/RELEASE.md
index 50b392dc091..9ccef55583a 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -54,6 +54,7 @@
     * Corrected higher-order gradients of control flow constructs (`tf.cond`,
      `tf.while_loop`, and compositions like `tf.foldl`) computed with
      `tf.GradientTape` inside a `tf.function`.
+    * Changed the default step size in `gradient_checker_v2.compute_gradient` to be exactly representable as a binary floating point number. This avoids needlessly polluting gradient approximations, which in some cases leads to false negatives in op gradient tests.
 *   `tf.summary`:
     * New `tf.summary.graph` allows manual write of TensorFlow graph
diff --git a/tensorflow/python/kernel_tests/relu_op_test.py b/tensorflow/python/kernel_tests/relu_op_test.py
index 95483881629..81f105899f3 100644
--- a/tensorflow/python/kernel_tests/relu_op_test.py
+++ b/tensorflow/python/kernel_tests/relu_op_test.py
@@ -19,9 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-from six.moves import xrange  # pylint: disable=redefined-builtin
 
-from tensorflow.python import tf2
 from tensorflow.python.eager import backprop
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -29,7 +27,6 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker_v2
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
@@ -117,45 +114,19 @@ class ReluTest(test.TestCase):
           order="F")
       err = gradient_checker_v2.max_error(*gradient_checker_v2.compute_gradient(
           nn_ops.relu, [x], delta=1.0 / 1024))
-      self.assertLess(err, 1e-4)
+      self.assertLess(err, 1e-6)
 
-  # The gradient for fp16 is inaccurate due to the low-precision.
-  # We compare the fp16 analytical gradient against their fp32 counterpart.
+  # The gradient test for ReLU is a bit tricky as the derivative is not well
+  # defined around zero, so the test inputs are chosen to stay away from zero.
   def testGradientFloat16(self):
-
-    def grad(x):
-      with backprop.GradientTape() as tape:
-        tape.watch(x)
-        y = nn_ops.l2_loss(nn_ops.relu(x))
-      return tape.gradient(y, x)
-
-    def f():
-      with test_util.use_gpu():
-        # Randomly construct a 1D shape from [1, 40)
-        shape = random_ops.random_uniform([1],
-                                          minval=1,
-                                          maxval=40,
-                                          dtype=dtypes.int32)
-        x32 = random_ops.random_uniform(shape, minval=-1, maxval=1)
-        x16 = math_ops.cast(x32, dtype=dtypes.float16)
-        return grad(x32), grad(x16)
-
-    # We're going to ensure that the fp16 and fp32 gradients
-    # are "close" to each other for ~100 random values.
-    #
-    # In TensorFlow 1.x, invoking f() (without eager execution enabled)
-    # would construct a graph. Instead of construct a graph with O(100) nodes,
-    # we construct a single graph to be executed ~100 times in a Session.
-    if not tf2.enabled():
-      d32_tensor, d16_tensor = f()
-      with self.cached_session() as sess:
-        f = lambda: sess.run([d32_tensor, d16_tensor])
-
-    # Repeat the experiment for 100 times. All tensor shapes and its tensor
-    # values are randomly generated for each run.
-    for _ in xrange(100):
-      d32, d16 = f()
-      self.assertAllClose(d32, d16, atol=3e-4)
+    with self.cached_session():
+      x = np.asarray(
+          [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
+          dtype=np.float16,
+          order="F")
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.relu, [x]))
+      self.assertLess(err, 1e-6)
 
   def testGradientFloat64(self):
     with self.cached_session():
@@ -165,7 +136,7 @@ class ReluTest(test.TestCase):
           order="F")
       err = gradient_checker_v2.max_error(*gradient_checker_v2.compute_gradient(
           nn_ops.relu, [x], delta=1.0 / 1024))
-      self.assertLess(err, 1e-10)
+      self.assertLess(err, 1e-15)
 
   def testGradGradFloat32(self):
     with self.cached_session():
diff --git a/tensorflow/python/ops/gradient_checker_v2.py b/tensorflow/python/ops/gradient_checker_v2.py
index 3ca0903c80c..ce5a4f76678 100644
--- a/tensorflow/python/ops/gradient_checker_v2.py
+++ b/tensorflow/python/ops/gradient_checker_v2.py
@@ -292,7 +292,7 @@ def _compute_gradient_list(f, xs, delta):
 
 
 @tf_export("test.compute_gradient", v1=[])
-def compute_gradient(f, x, delta=1e-3):
+def compute_gradient(f, x, delta=None):
   """Computes the theoretical and numeric Jacobian of `f`.
 
   With y = f(x), computes the theoretical and numeric Jacobian dy/dx.
@@ -329,6 +329,12 @@
     raise ValueError(
         "`x` must be a list or tuple of values convertible to a Tensor "
        "(arguments to `f`), not a %s" % type(x))
+  if delta is None:
+    # By default, we use a step size for the central finite difference
+    # approximation that is exactly representable as a binary floating
+    # point number, since this reduces the amount of noise due to rounding
+    # in the approximation of some functions.
+    delta = 1.0 / 1024
   return _compute_gradient_list(f, x, delta)
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
index b23d3b9f01b..2d4729f1867 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
@@ -18,7 +18,7 @@ tf_module {
   }
   member_method {
     name: "compute_gradient"
-    argspec: "args=[\'f\', \'x\', \'delta\'], varargs=None, keywords=None, defaults=[\'0.001\'], "
+    argspec: "args=[\'f\', \'x\', \'delta\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
    name: "create_local_cluster"
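
Background (not part of the patch; a minimal standalone sketch, assuming
plain NumPy float32 arithmetic is representative and using illustrative
variable names): a power-of-two step such as 1.0 / 1024 is exactly
representable in binary floating point, so the perturbation that actually
lands on x equals the nominal delta. A decimal step such as 1e-3 is rounded,
and the mismatch between the nominal and effective step shows up as noise in
the central finite difference quotient.

    import numpy as np

    # A value that is itself exactly representable in binary floating point.
    x = np.float32(0.5)

    for delta in (np.float32(1e-3), np.float32(1.0 / 1024)):
      # The perturbation actually applied to x after rounding x + delta.
      effective = (x + delta) - x
      print("nominal delta:", delta,
            "| effective step:", effective,
            "| exact match:", bool(effective == delta))

    # Under IEEE 754 this reports an exact match only for 1.0 / 1024; with
    # 1e-3 the effective step differs from the nominal one, which is the kind
    # of rounding noise the new default delta avoids.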