Make tf.nn.relu output NaN when its input is NaN.

Previously, it output 0.

In rare cases, this may cause some float16 models that were overflowing to start producing NaNs, but such models were not working correctly anyway. Previously, an overflow in the forward pass could be turned into NaN by BatchNormalization and then into 0 by relu, hiding the NaN; such models were broken regardless, since all the work done in the forward pass before the relu was discarded.

Also, on CPUs, relu(-0) now returns 0 instead of -0. This is unintentional and arbitrary, but relu(-0) was already inconsistent across devices and between XLA and non-XLA, so the sign of a zero result cannot be relied on.
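A minimal sketch of the new behavior (assuming a build that includes this change, eager execution, and CPU placement, given the caveat about -0 above):

    import tensorflow as tf

    # With this change, a NaN input propagates through relu instead of being
    # clamped to 0; on CPU, -0 now maps to 0.
    x = tf.constant([-2.0, float("nan"), -0.0, 3.0])
    print(tf.nn.relu(x).numpy())
    # Expected on CPU with this change: [ 0. nan  0.  3.]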

I tested performance on a Titan V with the following program:

    import tensorflow as tf
    import time

    def bench(dtype):
      x = tf.random.normal((2 ** 28,), dtype=dtype)
      p = tf.constant(0.)

      # Warmup
      tf.nn.relu(x)

      start = time.time()
      for _ in range(100):
        tf.nn.relu(x)
      # Synchronize GPU by sending result of computation to CPU
      p = p + 1.
      p.numpy()

      end = time.time()
      print('time for %s: %s' % (dtype, end - start))

    bench('float32')
    bench('float16')

I ran the benchmark 3 times with and without this change and took the average. The results show no difference in performance:

Before:
float32: 0.381
float16: 0.190

After:
float32: 0.380
float16: 0.190

PiperOrigin-RevId: 357290491
Change-Id: Icc01f3bfb35ddfce97d4fd347167f0c41322e3ae

@@ -4,7 +4,7 @@ op {
   description: <<END
 See: https://en.wikipedia.org/wiki/Rectifier_(neural_networks)
 Example usage:
->>> tf.nn.relu([-2., 0., -0., 3.]).numpy()
-array([ 0.,  0., -0.,  3.], dtype=float32)
+>>> tf.nn.relu([-2., 0., 3.]).numpy()
+array([0., 0., 3.], dtype=float32)
 END
 }

@@ -32,7 +32,8 @@ struct Relu {
   // activations: same shape as "features".
   void operator()(const Device& d, typename TTypes<T>::ConstTensor features,
                   typename TTypes<T>::Tensor activations) {
-    activations.device(d) = features.cwiseMax(static_cast<T>(0));
+    activations.device(d) =
+        features.template cwiseMax<Eigen::PropagateNaN>(static_cast<T>(0));
   }
 };
@@ -66,7 +67,8 @@ struct Relu6 {
   void operator()(const Device& d, typename TTypes<T>::ConstTensor features,
                   typename TTypes<T>::Tensor activations) {
     activations.device(d) =
-        features.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(6));
+        features.template cwiseMax<Eigen::PropagateNaN>(static_cast<T>(0))
+            .template cwiseMin<Eigen::PropagateNaN>(static_cast<T>(6));
   }
 };

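As an aside (not part of the diff), the semantic difference introduced by Eigen::PropagateNaN can be illustrated with a NumPy analogy: np.fmax ignores NaN much like the old element-wise max here did, while np.maximum propagates it like the new variant.

    import numpy as np

    x = np.array([-2.0, np.nan, 3.0], dtype=np.float32)

    # np.fmax ignores NaN, mirroring the old behavior where relu(NaN) == 0.
    print(np.fmax(x, 0.0))     # [0. 0. 3.]

    # np.maximum propagates NaN, mirroring the new PropagateNaN behavior.
    print(np.maximum(x, 0.0))  # [ 0. nan  3.]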

@@ -104,6 +104,11 @@ class ReluTest(test.TestCase):
   def testNoElement(self):
     self._testRelu(np.array([[], []], dtype=np.float32))
 
+  @test_util.disable_xla("b/157978028: Does not yet pass with XLA")
+  def testNaNPropagation(self):
+    for t in [np.float16, np.float32, np.float64]:
+      self._testRelu(np.array([-1, np.nan, 1, np.nan]).astype(t))
+
   # The gradient test for ReLU is a bit tricky as the derivative is not well
   # defined at around zero and we want to avoid that in terms of input values.
   def testGradientFloat32(self):
@@ -234,6 +239,11 @@ class Relu6Test(test.TestCase):
     self._testRelu6(
         np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
 
+  @test_util.disable_xla("b/157978028: Does not yet pass with XLA")
+  def testNaNPropagation(self):
+    for t in [np.float16, np.float32, np.float64]:
+      self._testRelu6(np.array([-1, np.nan, 1, 7, np.nan]).astype(t))
+
   # The gradient test for ReLU6 is a bit tricky as the derivative is
   # not well defined at around zero and six and we want to avoid that
   # in terms of input values.
@@ -294,6 +304,11 @@ class LeakyReluTest(test.TestCase):
         np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
         alpha=0.1)
 
+  def testNaNPropagation(self):
+    for t in [np.float16, np.float32, np.float64]:
+      self._testLeakyRelu(np.array([-1, np.nan, 1, np.nan]).astype(t),
+                          alpha=0.2)
+
   # The gradient test for Leaky ReLU is a bit tricky as the derivative is not
   # well defined at around zero and we want to avoid that in terms of input
   # values.
@@ -411,6 +426,10 @@ class EluTest(test.TestCase):
     for t in [np.float16, np.float32, np.float64]:
       self._testElu(np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
 
+  def testNaNPropagation(self):
+    for t in [np.float16, np.float32, np.float64]:
+      self._testElu(np.array([-1, np.nan, 1, np.nan]).astype(t))
+
   def testGradientFloat32(self):
     with self.cached_session():
       x_val = [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]]
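For completeness, a rough standalone sketch of the property these new tests exercise (not part of the commit; it assumes eager execution, a build that includes this change, and the default alpha for tf.nn.leaky_relu; the relu/relu6 cases are still expected to differ under XLA, per the disable_xla decorators above):

    import numpy as np
    import tensorflow as tf

    x = np.array([-1.0, np.nan, 1.0, np.nan], dtype=np.float32)

    # NaN positions in the input should remain NaN in the output.
    for fn in [tf.nn.relu, tf.nn.relu6, tf.nn.leaky_relu, tf.nn.elu]:
      y = fn(x).numpy()
      assert np.array_equal(np.isnan(x), np.isnan(y))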