Implement gradient for SparseSoftmax.
Essentially the same calculation as Softmax, adapted to sparse inputs/outputs. Change: 123332988
parent 6b8de8ac69
commit 008d41f4cc
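
The "same calculation as Softmax" claim can be checked in plain NumPy. The following sketch is not part of this commit (names are illustrative); it verifies the gradient identity quoted in the docstring below against the full softmax Jacobian:

import numpy as np

x = np.random.randn(5)
s = np.exp(x) / np.exp(x).sum()   # softmax(x)
g = np.random.randn(5)            # upstream gradient w.r.t. softmax(x)

# The identity used by _SparseSoftmaxGrad below:
grad_x = g * s - (g * s).sum() * s

# The same vector-Jacobian product from the full Jacobian, J = diag(s) - s s^T
# (J is symmetric, so J g == J^T g).
jac = np.diag(s) - np.outer(s, s)
assert np.allclose(grad_x, jac.dot(g))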
@@ -637,6 +637,17 @@ class SparseSoftmaxTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(sp_t.indices.eval(), result.indices)
       self.assertAllEqual(shape, result.shape)
 
+  def testGradient(self):
+    x_shape = [2, 5, 10]
+    with self.test_session(use_gpu=False):
+      for dtype in [np.float32, np.float64]:
+        x_np = np.random.randn(*x_shape).astype(dtype)
+        x_tf, nnz = _sparsify(x_np)
+        y_tf = tf.sparse_softmax(x_tf)
+        err = tf.test.compute_gradient_error(x_tf.values, (nnz,), y_tf.values,
+                                             (nnz,))
+        self.assertLess(err, 1e-4)
+
 
 if __name__ == "__main__":
   googletest.main()
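
_sparsify is a helper defined earlier in the test file and not shown in this diff. As a rough sketch of the behavior the test relies on (an assumption, not the file's actual code), it turns a dense array into a tf.SparseTensor of its non-zero entries and also returns how many there are:

def _sparsify(x, thresh=0.5):
  # Assumed behavior, for illustration only: zero out small entries, then
  # pack the survivors into a SparseTensor along with their count.
  x[x < thresh] = 0
  non_zero = np.where(x != 0)
  indices = np.vstack(non_zero).astype(np.int64).T  # shape [nnz, ndims]
  values = x[non_zero]
  return tf.SparseTensor(indices, values, x.shape), len(values)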
@@ -227,6 +227,33 @@ def _SparseDenseCwiseDivGrad(op, grad):
 
 
 @ops.RegisterGradient("SparseSoftmax")
-def _SparseSoftmaxGrad(unused_op, unused_grad):
-  raise NotImplementedError("SparseSoftmax op doesn't have its gradient"
-                            "implemented yet")
+def _SparseSoftmaxGrad(op, grad):
+  """Gradients for SparseSoftmax.
+
+  The calculation is the same as SoftmaxGrad:
+
+    grad_x = grad_softmax * softmax - sum(grad_softmax * softmax) * softmax
+
+  where we now only operate on the non-zero values present in the SparseTensors.
+
+  Args:
+    op: the SparseSoftmax op.
+    grad: the upstream gradient w.r.t. the non-zero SparseSoftmax output values.
+
+  Returns:
+    Gradients w.r.t. the input (sp_indices, sp_values, sp_shape).
+  """
+  indices, shape = op.inputs[0], op.inputs[2]
+  out_vals = op.outputs[0]
+  sp_output = ops.SparseTensor(indices, out_vals, shape)
+  sp_grad = ops.SparseTensor(indices, grad, shape)
+  sp_product = ops.SparseTensor(
+      indices, sp_output.values * sp_grad.values, shape)
+
+  # [..., B, 1], dense.
+  sum_reduced = -sparse_ops.sparse_reduce_sum(sp_product, [-1], keep_dims=True)
+  # sparse [..., B, C] + dense [..., B, 1] with broadcast; outputs sparse.
+  sp_sum = sparse_ops.sparse_dense_cwise_add(sp_grad, sum_reduced)
+
+  grad_x = sp_sum.values * sp_output.values
+  return [None, grad_x, None]
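
Once this gradient is registered, automatic differentiation flows through tf.sparse_softmax's non-zero values. A minimal sketch of that, assuming the TF-0.x-era API used above (the values here are hypothetical):

sp_x = tf.SparseTensor(indices=[[0, 0], [0, 2], [1, 1]],
                       values=tf.constant([1.0, 2.0, 3.0]),
                       shape=[2, 3])
sp_y = tf.sparse_softmax(sp_x)
loss = tf.reduce_sum(tf.square(sp_y.values))
# One gradient entry per non-zero input value; as the return value of
# _SparseSoftmaxGrad shows, indices and shape receive no gradient (None).
grad_vals, = tf.gradients(loss, [sp_x.values])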