Update gradient_checker_v2 to use a step size in the finite difference approximation that is exactly representable as a binary floating point number. This is an old trick that in some cases avoids polluting the finite difference approximation with rounding errors that cause false negatives in gradient tests.
PiperOrigin-RevId: 343348502 Change-Id: I3539ae7de7105177c5a1b9144b491f36369344f4
This commit is contained in:
parent
5f9f2d21d2
commit
1427bfc12e
@ -54,6 +54,7 @@
|
|||||||
* Corrected higher-order gradients of control flow constructs (`tf.cond`,
|
* Corrected higher-order gradients of control flow constructs (`tf.cond`,
|
||||||
`tf.while_loop`, and compositions like `tf.foldl`) computed with
|
`tf.while_loop`, and compositions like `tf.foldl`) computed with
|
||||||
`tf.GradientTape` inside a `tf.function`.
|
`tf.GradientTape` inside a `tf.function`.
|
||||||
|
* Changed the default step size in `gradient_checker_v2.compute_gradient` to be exactly representable as a binary floating point number. This avoids polluting gradient approximations needlessly, which in some cases leads to false negatives in op gradient tests.
|
||||||
|
|
||||||
* `tf.summary`:
|
* `tf.summary`:
|
||||||
* New `tf.summary.graph` allows manual write of TensorFlow graph
|
* New `tf.summary.graph` allows manual write of TensorFlow graph
|
||||||
|
@ -19,9 +19,7 @@ from __future__ import division
|
|||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from six.moves import xrange # pylint: disable=redefined-builtin
|
|
||||||
|
|
||||||
from tensorflow.python import tf2
|
|
||||||
from tensorflow.python.eager import backprop
|
from tensorflow.python.eager import backprop
|
||||||
from tensorflow.python.framework import constant_op
|
from tensorflow.python.framework import constant_op
|
||||||
from tensorflow.python.framework import dtypes
|
from tensorflow.python.framework import dtypes
|
||||||
@ -29,7 +27,6 @@ from tensorflow.python.framework import errors
|
|||||||
from tensorflow.python.framework import ops
|
from tensorflow.python.framework import ops
|
||||||
from tensorflow.python.framework import test_util
|
from tensorflow.python.framework import test_util
|
||||||
from tensorflow.python.ops import gradient_checker_v2
|
from tensorflow.python.ops import gradient_checker_v2
|
||||||
from tensorflow.python.ops import math_ops
|
|
||||||
from tensorflow.python.ops import nn_ops
|
from tensorflow.python.ops import nn_ops
|
||||||
from tensorflow.python.ops import random_ops
|
from tensorflow.python.ops import random_ops
|
||||||
from tensorflow.python.ops import variables
|
from tensorflow.python.ops import variables
|
||||||
@ -117,45 +114,19 @@ class ReluTest(test.TestCase):
|
|||||||
order="F")
|
order="F")
|
||||||
err = gradient_checker_v2.max_error(*gradient_checker_v2.compute_gradient(
|
err = gradient_checker_v2.max_error(*gradient_checker_v2.compute_gradient(
|
||||||
nn_ops.relu, [x], delta=1.0 / 1024))
|
nn_ops.relu, [x], delta=1.0 / 1024))
|
||||||
self.assertLess(err, 1e-4)
|
self.assertLess(err, 1e-6)
|
||||||
|
|
||||||
# The gradient for fp16 is inaccurate due to the low-precision.
|
# The gradient test for ReLU is a bit tricky as the derivative is not well
|
||||||
# We compare the fp16 analytical gradient against their fp32 counterpart.
|
# defined around zero, so the input values are chosen to stay away from it.
|
||||||
def testGradientFloat16(self):
|
def testGradientFloat16(self):
|
||||||
|
with self.cached_session():
|
||||||
def grad(x):
|
x = np.asarray(
|
||||||
with backprop.GradientTape() as tape:
|
[[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
|
||||||
tape.watch(x)
|
dtype=np.float16,
|
||||||
y = nn_ops.l2_loss(nn_ops.relu(x))
|
order="F")
|
||||||
return tape.gradient(y, x)
|
err = gradient_checker_v2.max_error(
|
||||||
|
*gradient_checker_v2.compute_gradient(nn_ops.relu, [x]))
|
||||||
def f():
|
self.assertLess(err, 1e-6)
|
||||||
with test_util.use_gpu():
|
|
||||||
# Randomly construct a 1D shape from [1, 40)
|
|
||||||
shape = random_ops.random_uniform([1],
|
|
||||||
minval=1,
|
|
||||||
maxval=40,
|
|
||||||
dtype=dtypes.int32)
|
|
||||||
x32 = random_ops.random_uniform(shape, minval=-1, maxval=1)
|
|
||||||
x16 = math_ops.cast(x32, dtype=dtypes.float16)
|
|
||||||
return grad(x32), grad(x16)
|
|
||||||
|
|
||||||
# We're going to ensure that the fp16 and fp32 gradients
|
|
||||||
# are "close" to each other for ~100 random values.
|
|
||||||
#
|
|
||||||
# In TensorFlow 1.x, invoking f() (without eager execution enabled)
|
|
||||||
# would construct a graph. Instead of constructing a graph with O(100) nodes,
|
|
||||||
# we construct a single graph to be executed ~100 times in a Session.
|
|
||||||
if not tf2.enabled():
|
|
||||||
d32_tensor, d16_tensor = f()
|
|
||||||
with self.cached_session() as sess:
|
|
||||||
f = lambda: sess.run([d32_tensor, d16_tensor])
|
|
||||||
|
|
||||||
# Repeat the experiment for 100 times. All tensor shapes and its tensor
|
|
||||||
# values are randomly generated for each run.
|
|
||||||
for _ in xrange(100):
|
|
||||||
d32, d16 = f()
|
|
||||||
self.assertAllClose(d32, d16, atol=3e-4)
|
|
||||||
|
|
||||||
def testGradientFloat64(self):
|
def testGradientFloat64(self):
|
||||||
with self.cached_session():
|
with self.cached_session():
|
||||||
@ -165,7 +136,7 @@ class ReluTest(test.TestCase):
|
|||||||
order="F")
|
order="F")
|
||||||
err = gradient_checker_v2.max_error(*gradient_checker_v2.compute_gradient(
|
err = gradient_checker_v2.max_error(*gradient_checker_v2.compute_gradient(
|
||||||
nn_ops.relu, [x], delta=1.0 / 1024))
|
nn_ops.relu, [x], delta=1.0 / 1024))
|
||||||
self.assertLess(err, 1e-10)
|
self.assertLess(err, 1e-15)
|
||||||
|
|
||||||
def testGradGradFloat32(self):
|
def testGradGradFloat32(self):
|
||||||
with self.cached_session():
|
with self.cached_session():
|
||||||
|
@ -292,7 +292,7 @@ def _compute_gradient_list(f, xs, delta):
|
|||||||
|
|
||||||
|
|
||||||
@tf_export("test.compute_gradient", v1=[])
|
@tf_export("test.compute_gradient", v1=[])
|
||||||
def compute_gradient(f, x, delta=1e-3):
|
def compute_gradient(f, x, delta=None):
|
||||||
"""Computes the theoretical and numeric Jacobian of `f`.
|
"""Computes the theoretical and numeric Jacobian of `f`.
|
||||||
|
|
||||||
With y = f(x), computes the theoretical and numeric Jacobian dy/dx.
|
With y = f(x), computes the theoretical and numeric Jacobian dy/dx.
|
||||||
@ -329,6 +329,12 @@ def compute_gradient(f, x, delta=1e-3):
|
|||||||
raise ValueError(
|
raise ValueError(
|
||||||
"`x` must be a list or tuple of values convertible to a Tensor "
|
"`x` must be a list or tuple of values convertible to a Tensor "
|
||||||
"(arguments to `f`), not a %s" % type(x))
|
"(arguments to `f`), not a %s" % type(x))
|
||||||
|
if delta is None:
|
||||||
|
# By default, we use a step size for the central finite difference
|
||||||
|
# approximation that is exactly representable as a binary floating
|
||||||
|
# point number, since this reduces the amount of noise due to rounding
|
||||||
|
# in the approximation of some functions.
|
||||||
|
delta = 1.0 / 1024
|
||||||
return _compute_gradient_list(f, x, delta)
|
return _compute_gradient_list(f, x, delta)
|
||||||
|
|
||||||
|
|
||||||
|
@ -18,7 +18,7 @@ tf_module {
|
|||||||
}
|
}
|
||||||
member_method {
|
member_method {
|
||||||
name: "compute_gradient"
|
name: "compute_gradient"
|
||||||
argspec: "args=[\'f\', \'x\', \'delta\'], varargs=None, keywords=None, defaults=[\'0.001\'], "
|
argspec: "args=[\'f\', \'x\', \'delta\'], varargs=None, keywords=None, defaults=[\'None\'], "
|
||||||
}
|
}
|
||||||
member_method {
|
member_method {
|
||||||
name: "create_local_cluster"
|
name: "create_local_cluster"
|
||||||
|
Loading…
Reference in New Issue
Block a user