From 008d41f4cc9f80fc4f29fc90da5b0ae361624fed Mon Sep 17 00:00:00 2001
From: Zongheng Yang
Date: Thu, 26 May 2016 09:37:42 -0800
Subject: [PATCH] Implement gradient for SparseSoftmax.

Essentially the same calculation as Softmax, adapted to sparse
inputs/outputs.
Change: 123332988
---
 .../python/kernel_tests/sparse_ops_test.py | 11 +++++++
 tensorflow/python/ops/sparse_grad.py       | 33 +++++++++++++++++--
 2 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/kernel_tests/sparse_ops_test.py b/tensorflow/python/kernel_tests/sparse_ops_test.py
index 037d1f2c3eb..f6474f58458 100644
--- a/tensorflow/python/kernel_tests/sparse_ops_test.py
+++ b/tensorflow/python/kernel_tests/sparse_ops_test.py
@@ -637,6 +637,17 @@ class SparseSoftmaxTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(sp_t.indices.eval(), result.indices)
     self.assertAllEqual(shape, result.shape)
 
+  def testGradient(self):
+    x_shape = [2, 5, 10]
+    with self.test_session(use_gpu=False):
+      for dtype in [np.float32, np.float64]:
+        x_np = np.random.randn(*x_shape).astype(dtype)
+        x_tf, nnz = _sparsify(x_np)
+        y_tf = tf.sparse_softmax(x_tf)
+        err = tf.test.compute_gradient_error(x_tf.values, (nnz,), y_tf.values,
+                                             (nnz,))
+        self.assertLess(err, 1e-4)
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/ops/sparse_grad.py b/tensorflow/python/ops/sparse_grad.py
index d2e1ae20967..16c45aba544 100644
--- a/tensorflow/python/ops/sparse_grad.py
+++ b/tensorflow/python/ops/sparse_grad.py
@@ -227,6 +227,33 @@ def _SparseDenseCwiseDivGrad(op, grad):
 
 
 @ops.RegisterGradient("SparseSoftmax")
-def _SparseSoftmaxGrad(unused_op, unused_grad):
-  raise NotImplementedError("SparseSoftmax op doesn't have its gradient"
-                            "implemented yet")
+def _SparseSoftmaxGrad(op, grad):
+  """Gradients for SparseSoftmax.
+
+  The calculation is the same as SoftmaxGrad:
+
+    grad_x = grad_softmax * softmax - sum(grad_softmax * softmax) * softmax
+
+  where we now only operate on the non-zero values present in the SparseTensors.
+
+  Args:
+    op: the SparseSoftmax op.
+    grad: the upstream gradient w.r.t. the non-zero SparseSoftmax output values.
+
+  Returns:
+    Gradients w.r.t. the input (sp_indices, sp_values, sp_shape).
+  """
+  indices, shape = op.inputs[0], op.inputs[2]
+  out_vals = op.outputs[0]
+  sp_output = ops.SparseTensor(indices, out_vals, shape)
+  sp_grad = ops.SparseTensor(indices, grad, shape)
+  sp_product = ops.SparseTensor(
+      indices, sp_output.values * sp_grad.values, shape)
+
+  # [..., B, 1], dense.
+  sum_reduced = -sparse_ops.sparse_reduce_sum(sp_product, [-1], keep_dims=True)
+  # sparse [..., B, C] + dense [..., B, 1] with broadcast; outputs sparse.
+  sp_sum = sparse_ops.sparse_dense_cwise_add(sp_grad, sum_reduced)
+
+  grad_x = sp_sum.values * sp_output.values
+  return [None, grad_x, None]
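
The identity quoted in the docstring can be sanity-checked outside of
TensorFlow. The following standalone NumPy sketch is not part of the patch
(all names in it are illustrative): it applies the dense formula to a single
vector of values, which is exactly how the sparse op treats each row's
non-zero entries, and compares the result against central finite differences.

import numpy as np

def softmax(x):
  # Numerically stable softmax over the last axis.
  e = np.exp(x - x.max(axis=-1, keepdims=True))
  return e / e.sum(axis=-1, keepdims=True)

def softmax_vjp(y, g):
  # The formula from the docstring, applied densely:
  #   grad_x = g * y - sum(g * y) * y  ==  (g - sum(g * y)) * y
  return (g - (g * y).sum(axis=-1, keepdims=True)) * y

rng = np.random.RandomState(0)
x = rng.randn(7)   # stands in for one row's non-zero values
g = rng.randn(7)   # upstream gradient w.r.t. the softmax outputs
analytic = softmax_vjp(softmax(x), g)

# Central finite differences of f(x) = softmax(x) . g, one input at a time.
eps = 1e-6
numeric = np.empty_like(x)
for i in range(x.size):
  xp, xm = x.copy(), x.copy()
  xp[i] += eps
  xm[i] -= eps
  numeric[i] = (softmax(xp).dot(g) - softmax(xm).dot(g)) / (2 * eps)

assert np.abs(analytic - numeric).max() < 1e-8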
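
For completeness, here is a minimal end-to-end usage sketch of the newly
registered gradient, assuming the 0.x-era graph API the patch targets
(tf.SparseTensor, tf.sparse_softmax, tf.gradients, tf.Session); the indices
and values are made up for illustration.

import numpy as np
import tensorflow as tf

# Illustrative data: a 2x4 SparseTensor with two non-zero entries per row.
indices = np.array([[0, 0], [0, 3], [1, 1], [1, 2]], dtype=np.int64)
values = np.array([0.5, -1.0, 2.0, 0.3], dtype=np.float32)
shape = np.array([2, 4], dtype=np.int64)

sp_x = tf.SparseTensor(indices, values, shape)
sp_y = tf.sparse_softmax(sp_x)

# Before this patch, differentiating through SparseSoftmax raised
# NotImplementedError; with it, the usual graph-mode machinery works.
grad_x = tf.gradients(sp_y.values, sp_x.values)[0]

with tf.Session() as sess:
  print(sess.run(grad_x))  # one gradient value per non-zero input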