Have tf.nn.relu output NaN for NaN inputs.
Previously it output 0. In rare cases, this change may cause some models using float16 to start producing NaNs if they were overflowing, but such models were not working correctly anyway: an overflow on the forward pass could be turned into NaN by BatchNormalization, which relu then turned into 0, hiding the NaN. All the work done in the forward pass before the relu was discarded, so those models were already broken.

Also, on CPUs, relu(-0) is now 0, not -0. This is unintentional and arbitrary, but relu(-0) is already inconsistent across different devices and XLA, so -0 vs 0 cannot be relied on.

I tested performance on a Titan V with the following program:

import tensorflow as tf
import time

def bench(dtype):
  x = tf.random.normal((2 ** 28,), dtype=dtype)
  p = tf.constant(0.)
  # Warmup
  tf.nn.relu(x)
  start = time.time()
  for _ in range(100):
    tf.nn.relu(x)
  # Synchronize GPU by sending result of computation to CPU
  p = p + 1.
  p.numpy()
  end = time.time()
  print('time for %s: %s' % (dtype, end - start))

bench('float32')
bench('float16')

I ran three times with and without this change and took the average. The results show no difference in performance:

Before: float32: 0.381, float16: 0.190
After:  float32: 0.380, float16: 0.190

PiperOrigin-RevId: 357290491
Change-Id: Icc01f3bfb35ddfce97d4fd347167f0c41322e3ae
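For reference, a minimal sketch (not part of this change) of the new behavior as seen from the Python API; it assumes a TensorFlow build that includes this commit, and the printed formatting may differ slightly:

import numpy as np
import tensorflow as tf

x = tf.constant([-2., -0., 0., float('nan'), 3.], dtype=tf.float32)
y = tf.nn.relu(x)
# NaN inputs now propagate to the output instead of being mapped to 0,
# and on CPUs relu(-0) produces 0 rather than -0.
print(y.numpy())            # e.g. [ 0.  0.  0. nan  3.]
print(np.isnan(y.numpy()))  # [False False False  True False]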
parent 454d8c51cb
commit 14dae44e40
@@ -4,7 +4,7 @@ op {
   description: <<END
 See: https://en.wikipedia.org/wiki/Rectifier_(neural_networks)
 Example usage:
->>> tf.nn.relu([-2., 0., -0., 3.]).numpy()
-array([ 0., 0., -0., 3.], dtype=float32)
+>>> tf.nn.relu([-2., 0., 3.]).numpy()
+array([0., 0., 3.], dtype=float32)
 END
 }
@@ -32,7 +32,8 @@ struct Relu {
   // activations: same shape as "features".
   void operator()(const Device& d, typename TTypes<T>::ConstTensor features,
                   typename TTypes<T>::Tensor activations) {
-    activations.device(d) = features.cwiseMax(static_cast<T>(0));
+    activations.device(d) =
+        features.template cwiseMax<Eigen::PropagateNaN>(static_cast<T>(0));
   }
 };
 
@@ -66,7 +67,8 @@ struct Relu6 {
   void operator()(const Device& d, typename TTypes<T>::ConstTensor features,
                   typename TTypes<T>::Tensor activations) {
     activations.device(d) =
-        features.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(6));
+        features.template cwiseMax<Eigen::PropagateNaN>(static_cast<T>(0))
+            .template cwiseMin<Eigen::PropagateNaN>(static_cast<T>(6));
   }
 };
 
@@ -104,6 +104,11 @@ class ReluTest(test.TestCase):
   def testNoElement(self):
     self._testRelu(np.array([[], []], dtype=np.float32))
 
+  @test_util.disable_xla("b/157978028: Does not yet pass with XLA")
+  def testNaNPropagation(self):
+    for t in [np.float16, np.float32, np.float64]:
+      self._testRelu(np.array([-1, np.nan, 1, np.nan]).astype(t))
+
   # The gradient test for ReLU is a bit tricky as the derivative is not well
   # defined at around zero and we want to avoid that in terms of input values.
   def testGradientFloat32(self):
@@ -234,6 +239,11 @@ class Relu6Test(test.TestCase):
       self._testRelu6(
           np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
 
+  @test_util.disable_xla("b/157978028: Does not yet pass with XLA")
+  def testNaNPropagation(self):
+    for t in [np.float16, np.float32, np.float64]:
+      self._testRelu6(np.array([-1, np.nan, 1, 7, np.nan]).astype(t))
+
   # The gradient test for ReLU6 is a bit tricky as the derivative is
   # not well defined at around zero and six and we want to avoid that
   # in terms of input values.
@@ -294,6 +304,11 @@ class LeakyReluTest(test.TestCase):
           np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
           alpha=0.1)
 
+  def testNaNPropagation(self):
+    for t in [np.float16, np.float32, np.float64]:
+      self._testLeakyRelu(np.array([-1, np.nan, 1, np.nan]).astype(t),
+                          alpha=0.2)
+
   # The gradient test for Leaky ReLU is a bit tricky as the derivative is not
   # well defined at around zero and we want to avoid that in terms of input
   # values.
@@ -411,6 +426,10 @@ class EluTest(test.TestCase):
     for t in [np.float16, np.float32, np.float64]:
       self._testElu(np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
 
+  def testNaNPropagation(self):
+    for t in [np.float16, np.float32, np.float64]:
+      self._testElu(np.array([-1, np.nan, 1, np.nan]).astype(t))
+
   def testGradientFloat32(self):
     with self.cached_session():
       x_val = [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]]