Treat SparseApply* on empty sparse gradients as no-op
PiperOrigin-RevId: 353040731
Change-Id: Ibe213b002efc2622c86fb936c477c1e13820f3e4
parent 50b1c27aca
commit 8cb8c460a3
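To illustrate the behavior this change targets, here is a minimal Python sketch (not part of the commit) of an empty sparse-gradient update. It assumes a recent TensorFlow build and uses the resource variant tf.raw_ops.ResourceSparseApplyAdagrad, which to my understanding dispatches to the same SparseApplyAdagrad GPU functor patched below. With zero gradient rows and an empty index vector, the update is now a no-op instead of launching a kernel with an empty launch configuration.

# Sketch only, not part of the commit. Assumes the ResourceSparseApplyAdagrad
# raw op is available; GPU placement is optional.
import numpy as np
import tensorflow as tf

var = tf.Variable(np.arange(30, dtype=np.float32).reshape(3, 10))
accum = tf.Variable(np.ones([3, 10], dtype=np.float32))

# An "empty" sparse gradient: zero rows of values and an empty index vector.
empty_grad = tf.zeros([0, 10], dtype=tf.float32)
empty_indices = tf.zeros([0], dtype=tf.int32)

# With this change the update is a no-op; var and accum stay untouched.
tf.raw_ops.ResourceSparseApplyAdagrad(
    var=var.handle,
    accum=accum.handle,
    lr=tf.constant(2.0),
    grad=empty_grad,
    indices=empty_indices)

print(var.numpy()[:, :3])  # unchanged values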
@@ -512,6 +512,9 @@ struct SparseApplyAdagrad<GPUDevice, T, Tindex, has_epsilon> {
     const Tindex first_dim_size = var.dimension(0);
     const Tindex grad_size = grad.size();
     const Tindex indices_size = indices.size();
+    if (grad_size == 0) {
+      return Status::OK();
+    }
     GpuLaunchConfig config = GetGpuLaunchConfig(grad_size, d);
     return GpuLaunchKernel(
         SparseApplyAdagradKernel<T, Tindex, has_epsilon>, config.block_count,
@@ -570,6 +573,9 @@ struct SparseApplyProximalAdagrad<GPUDevice, T, Tindex> {
     const Tindex first_dim_size = var.dimension(0);
     const Tindex grad_size = grad.size();
     const Tindex indices_size = indices.size();
+    if (grad_size == 0) {
+      return Status::OK();
+    }
     GpuLaunchConfig config = GetGpuLaunchConfig(grad_size, d);
     return GpuLaunchKernel(SparseApplyProximalAdagradKernel<T, Tindex>,
                            config.block_count, config.thread_per_block, 0,
@@ -777,6 +783,9 @@ struct SparseApplyFtrl<GPUDevice, T, Tindex, has_l2_shrinkage> {
     const Tindex first_dim_size = var.dimension(0);
     const Tindex grad_size = grad.size();
     const Tindex indices_size = indices.size();
+    if (grad_size == 0) {
+      return Status::OK();
+    }
     GpuLaunchConfig config = GetGpuLaunchConfig(grad_size, d);
     return GpuLaunchKernel(
         SparseApplyFtrlKernel<T, Tindex, has_l2_shrinkage>, config.block_count,
@@ -846,12 +855,14 @@ struct SparseApplyKerasMomentum<GPUDevice, T, Tindex> {
     const Tindex first_dim_size = var.dimension(0);
     const Tindex grad_size = grad.size();
     const Tindex indices_size = indices.size();
-    GpuLaunchConfig config = GetGpuLaunchConfig(grad_size, d);
-    TF_CHECK_OK(GpuLaunchKernel(
-        SparseApplyKerasMomentumKernel<T, Tindex>, config.block_count,
-        config.thread_per_block, 0, d.stream(), var.data(), accum.data(),
-        lr.data(), grad.data(), indices.data(), momentum.data(), use_nesterov,
-        first_dim_size, grad_size, indices_size));
+    if (grad_size != 0) {
+      GpuLaunchConfig config = GetGpuLaunchConfig(grad_size, d);
+      TF_CHECK_OK(GpuLaunchKernel(
+          SparseApplyKerasMomentumKernel<T, Tindex>, config.block_count,
+          config.thread_per_block, 0, d.stream(), var.data(), accum.data(),
+          lr.data(), grad.data(), indices.data(), momentum.data(), use_nesterov,
+          first_dim_size, grad_size, indices_size));
+    }
     return static_cast<Tindex>(-1);
   }
 };
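Unlike the Status-returning functors above, SparseApplyKerasMomentum returns a Tindex, so the fix wraps the kernel launch in if (grad_size != 0) and still falls through to return static_cast<Tindex>(-1). The same empty-gradient case can be exercised from Python; this is a hedged sketch, assuming the tf.raw_ops.ResourceSparseApplyKerasMomentum op (used by Keras SGD with momentum) routes to this functor.

# Sketch only, not part of the commit. Assumes tf.raw_ops exposes
# ResourceSparseApplyKerasMomentum with (var, accum, lr, grad, indices,
# momentum) inputs.
import tensorflow as tf

var = tf.Variable(tf.ones([3, 4]))
accum = tf.Variable(tf.zeros([3, 4]))  # momentum accumulator slot

# Empty sparse gradient: no value rows, no indices.
empty_grad = tf.zeros([0, 4])
empty_indices = tf.zeros([0], dtype=tf.int32)

tf.raw_ops.ResourceSparseApplyKerasMomentum(
    var=var.handle,
    accum=accum.handle,
    lr=tf.constant(0.1),
    grad=empty_grad,
    indices=empty_indices,
    momentum=tf.constant(0.9),
    use_nesterov=False)

print(var.numpy())  # still all ones: the empty update is a no-op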
@@ -223,9 +223,9 @@ class TrainingOpsTest(TensorFlowTestCase):
       self._testTypesForFtrlMultiplyLinearByLr(
           x, y, z, lr, grad, use_gpu=False, l1=l1, l2=l2)
 
-  def _testTypesForSparseAdagrad(self, x, y, lr, grad, indices):
+  def _testTypesForSparseAdagrad(self, x, y, lr, grad, indices, use_gpu):
     self.setUp()
-    with self.session(use_gpu=True):
+    with self.session(use_gpu=use_gpu):
       var = variables.VariableV1(x)
       accum = variables.VariableV1(y)
       self.evaluate(variables.global_variables_initializer())
@@ -251,11 +251,12 @@ class TrainingOpsTest(TensorFlowTestCase):
                               lr,
                               grad,
                               indices,
+                              use_gpu,
                               l1=0.0,
                               l2=0.0,
                               lr_power=-0.5):
     self.setUp()
-    with self.session(use_gpu=False):
+    with self.session(use_gpu=use_gpu):
       var = variables.VariableV1(x)
       accum = variables.VariableV1(y)
       linear = variables.VariableV1(z)
@@ -327,8 +328,9 @@ class TrainingOpsTest(TensorFlowTestCase):
   @test_util.run_v1_only("SparseApplyAdagrad op returns a ref, so it is not "
                          "supported in eager mode.")
   def testSparseApplyAdagrad(self):
-    for (dtype, index_type) in itertools.product(
-        [np.float16, np.float32, np.float64], [np.int32, np.int64]):
+    for (dtype, index_type,
+         use_gpu) in itertools.product([np.float16, np.float32, np.float64],
+                                       [np.int32, np.int64], [False, True]):
       x_val = [np.arange(10), np.arange(10, 20), np.arange(20, 30)]
       y_val = [np.arange(1, 11), np.arange(11, 21), np.arange(21, 31)]
       x = np.array(x_val).astype(dtype)
@@ -337,13 +339,19 @@ class TrainingOpsTest(TensorFlowTestCase):
       grad_val = [np.arange(10), np.arange(10)]
       grad = np.array(grad_val).astype(dtype)
       indices = np.array([0, 2]).astype(index_type)
-      self._testTypesForSparseAdagrad(x, y, lr, grad, indices)
+      self._testTypesForSparseAdagrad(x, y, lr, grad, indices, use_gpu)
+      # Empty sparse gradients.
+      empty_grad = np.zeros([0, 10], dtype=dtype)
+      empty_indices = np.zeros([0], dtype=index_type)
+      self._testTypesForSparseAdagrad(x, y, lr, empty_grad, empty_indices,
+                                      use_gpu)
 
   @test_util.run_v1_only("SparseApplyAdagrad op returns a ref, so it is not "
                          "supported in eager mode.")
   def testSparseApplyAdagradDim1(self):
-    for (dtype, index_type) in itertools.product(
-        [np.float16, np.float32, np.float64], [np.int32, np.int64]):
+    for (dtype, index_type,
+         use_gpu) in itertools.product([np.float16, np.float32, np.float64],
+                                       [np.int32, np.int64], [False, True]):
       x_val = [[1.0], [2.0], [3.0]]
       y_val = [[4.0], [5.0], [6.0]]
       x = np.array(x_val).astype(dtype)
@@ -352,13 +360,18 @@ class TrainingOpsTest(TensorFlowTestCase):
       grad_val = [[1.5], [2.5]]
       grad = np.array(grad_val).astype(dtype)
       indices = np.array([0, 2]).astype(index_type)
-      self._testTypesForSparseAdagrad(x, y, lr, grad, indices)
+      self._testTypesForSparseAdagrad(x, y, lr, grad, indices, use_gpu)
 
   @test_util.run_v1_only("SparseApplyFtrl op returns a ref, so it is not "
                          "supported in eager mode.")
   def testSparseApplyFtrlDim1(self):
-    for (dtype, index_type) in itertools.product(
-        [np.float16, np.float32, np.float64], [np.int32, np.int64]):
+    for (dtype, index_type,
+         use_gpu) in itertools.product([np.float16, np.float32, np.float64],
+                                       [np.int32, np.int64], [False, True]):
+      # TODO(b/178042695): This configuration leads to a "too many resources
+      # requested for launch" error.
+      if (dtype, index_type, use_gpu) == (np.float64, np.int64, True):
+        continue
       x_val = [[0.0], [0.0], [0.0]]
       y_val = [[4.0], [5.0], [6.0]]
       z_val = [[0.0], [0.0], [0.0]]
@@ -369,7 +382,12 @@ class TrainingOpsTest(TensorFlowTestCase):
       grad_val = [[1.5], [2.5]]
       grad = np.array(grad_val).astype(dtype)
       indices = np.array([0, 2]).astype(index_type)
-      self._testTypesForSparseFtrl(x, y, z, lr, grad, indices)
+      self._testTypesForSparseFtrl(x, y, z, lr, grad, indices, use_gpu)
+      # Empty sparse gradients.
+      empty_grad = np.zeros([0, 1], dtype=dtype)
+      empty_indices = np.zeros([0], dtype=index_type)
+      self._testTypesForSparseFtrl(x, y, z, lr, empty_grad, empty_indices,
+                                   use_gpu)
 
   @test_util.run_v1_only("SparseApplyFtrlMultiplyLinearByLr op returns a ref, "
                          "so it is not supported in eager mode.")