Make tf.nn.relu output NaN when its input is NaN.

Previously, it output 0.

In rare cases, this may cause some float16 models that were overflowing to start producing NaNs, but such models were not working correctly anyway. Previously, an overflow in the forward pass could be turned into NaN by BatchNormalization and then into 0 by relu, hiding the NaN; such models were broken regardless, since all the work done in the forward pass before the relu was discarded.

Also, on CPUs, relu(-0) now returns 0 instead of -0. This is unintentional and arbitrary, but relu(-0) was already inconsistent across devices and between XLA and non-XLA, so the sign of a zero result cannot be relied on.
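A minimal sketch of the new behavior (assuming a build that includes this change, eager execution, and CPU placement, given the caveat about -0 above):

    import tensorflow as tf

    # With this change, a NaN input propagates through relu instead of being
    # clamped to 0; on CPU, -0 now maps to 0.
    x = tf.constant([-2.0, float("nan"), -0.0, 3.0])
    print(tf.nn.relu(x).numpy())
    # Expected on CPU with this change: [ 0. nan  0.  3.]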

I tested performance on a Titan V with the following program:

    import tensorflow as tf
    import time

    def bench(dtype):
      x = tf.random.normal((2 ** 28,), dtype=dtype)
      p = tf.constant(0.)

      # Warmup
      tf.nn.relu(x)

      start = time.time()
      for _ in range(100):
        tf.nn.relu(x)
      # Synchronize GPU by sending result of computation to CPU
      p = p + 1.
      p.numpy()

      end = time.time()
      print('time for %s: %s' % (dtype, end - start))

    bench('float32')
    bench('float16')

I ran the benchmark 3 times with and without this change and took the average. The results show no difference in performance:

Before:
float32: 0.381
float16: 0.190

After:
float32: 0.380
float16: 0.190

PiperOrigin-RevId: 357290491
Change-Id: Icc01f3bfb35ddfce97d4fd347167f0c41322e3ae

@@ -4,7 +4,7 @@ op {
   description: <<END
 See: https://en.wikipedia.org/wiki/Rectifier_(neural_networks)
 Example usage:
->>> tf.nn.relu([-2., 0., -0., 3.]).numpy()
-array([ 0.,  0., -0.,  3.], dtype=float32)
+>>> tf.nn.relu([-2., 0., 3.]).numpy()
+array([0., 0., 3.], dtype=float32)
 END
 }

@@ -32,7 +32,8 @@ struct Relu {
   // activations: same shape as "features".
   void operator()(const Device& d, typename TTypes<T>::ConstTensor features,
                   typename TTypes<T>::Tensor activations) {
-    activations.device(d) = features.cwiseMax(static_cast<T>(0));
+    activations.device(d) =
+        features.template cwiseMax<Eigen::PropagateNaN>(static_cast<T>(0));
   }
 };
@@ -66,7 +67,8 @@ struct Relu6 {
   void operator()(const Device& d, typename TTypes<T>::ConstTensor features,
                   typename TTypes<T>::Tensor activations) {
     activations.device(d) =
-        features.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(6));
+        features.template cwiseMax<Eigen::PropagateNaN>(static_cast<T>(0))
+            .template cwiseMin<Eigen::PropagateNaN>(static_cast<T>(6));
   }
 };

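As an aside (not part of the diff), the semantic difference introduced by Eigen::PropagateNaN can be illustrated with a NumPy analogy: np.fmax ignores NaN much like the old element-wise max here did, while np.maximum propagates it like the new variant.

    import numpy as np

    x = np.array([-2.0, np.nan, 3.0], dtype=np.float32)

    # np.fmax ignores NaN, mirroring the old behavior where relu(NaN) == 0.
    print(np.fmax(x, 0.0))     # [0. 0. 3.]

    # np.maximum propagates NaN, mirroring the new PropagateNaN behavior.
    print(np.maximum(x, 0.0))  # [ 0. nan  3.]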

@@ -104,6 +104,11 @@ class ReluTest(test.TestCase):
   def testNoElement(self):
     self._testRelu(np.array([[], []], dtype=np.float32))
 
+  @test_util.disable_xla("b/157978028: Does not yet pass with XLA")
+  def testNaNPropagation(self):
+    for t in [np.float16, np.float32, np.float64]:
+      self._testRelu(np.array([-1, np.nan, 1, np.nan]).astype(t))
+
   # The gradient test for ReLU is a bit tricky as the derivative is not well
   # defined at around zero and we want to avoid that in terms of input values.
   def testGradientFloat32(self):
@@ -234,6 +239,11 @@ class Relu6Test(test.TestCase):
     self._testRelu6(
         np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
 
+  @test_util.disable_xla("b/157978028: Does not yet pass with XLA")
+  def testNaNPropagation(self):
+    for t in [np.float16, np.float32, np.float64]:
+      self._testRelu6(np.array([-1, np.nan, 1, 7, np.nan]).astype(t))
+
   # The gradient test for ReLU6 is a bit tricky as the derivative is
   # not well defined at around zero and six and we want to avoid that
   # in terms of input values.
@@ -294,6 +304,11 @@ class LeakyReluTest(test.TestCase):
         np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
         alpha=0.1)
 
+  def testNaNPropagation(self):
+    for t in [np.float16, np.float32, np.float64]:
+      self._testLeakyRelu(np.array([-1, np.nan, 1, np.nan]).astype(t),
+                          alpha=0.2)
+
   # The gradient test for Leaky ReLU is a bit tricky as the derivative is not
   # well defined at around zero and we want to avoid that in terms of input
   # values.
@@ -411,6 +426,10 @@ class EluTest(test.TestCase):
     for t in [np.float16, np.float32, np.float64]:
       self._testElu(np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
 
+  def testNaNPropagation(self):
+    for t in [np.float16, np.float32, np.float64]:
+      self._testElu(np.array([-1, np.nan, 1, np.nan]).astype(t))
+
   def testGradientFloat32(self):
     with self.cached_session():
       x_val = [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]]
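For completeness, a rough standalone sketch of the property these new tests exercise (not part of the commit; it assumes eager execution, a build that includes this change, and the default alpha for tf.nn.leaky_relu; the relu/relu6 cases are still expected to differ under XLA, per the disable_xla decorators above):

    import numpy as np
    import tensorflow as tf

    x = np.array([-1.0, np.nan, 1.0, np.nan], dtype=np.float32)

    # NaN positions in the input should remain NaN in the output.
    for fn in [tf.nn.relu, tf.nn.relu6, tf.nn.leaky_relu, tf.nn.elu]:
      y = fn(x).numpy()
      assert np.array_equal(np.isnan(x), np.isnan(y))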