From 008d41f4cc9f80fc4f29fc90da5b0ae361624fed Mon Sep 17 00:00:00 2001
From: Zongheng Yang
Date: Thu, 26 May 2016 09:37:42 -0800
Subject: [PATCH] Implement gradient for SparseSoftmax.

Essentially the same calculation as Softmax, adapted to sparse
inputs/outputs.
Change: 123332988
---
 .../python/kernel_tests/sparse_ops_test.py | 11 +++++++
 tensorflow/python/ops/sparse_grad.py       | 33 +++++++++++++++++--
 2 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/kernel_tests/sparse_ops_test.py b/tensorflow/python/kernel_tests/sparse_ops_test.py
index 037d1f2c3eb..f6474f58458 100644
--- a/tensorflow/python/kernel_tests/sparse_ops_test.py
+++ b/tensorflow/python/kernel_tests/sparse_ops_test.py
@@ -637,6 +637,17 @@ class SparseSoftmaxTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(sp_t.indices.eval(), result.indices)
     self.assertAllEqual(shape, result.shape)
 
+  def testGradient(self):
+    x_shape = [2, 5, 10]
+    with self.test_session(use_gpu=False):
+      for dtype in [np.float32, np.float64]:
+        x_np = np.random.randn(*x_shape).astype(dtype)
+        x_tf, nnz = _sparsify(x_np)
+        y_tf = tf.sparse_softmax(x_tf)
+        err = tf.test.compute_gradient_error(x_tf.values, (nnz,), y_tf.values,
+                                             (nnz,))
+        self.assertLess(err, 1e-4)
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/ops/sparse_grad.py b/tensorflow/python/ops/sparse_grad.py
index d2e1ae20967..16c45aba544 100644
--- a/tensorflow/python/ops/sparse_grad.py
+++ b/tensorflow/python/ops/sparse_grad.py
@@ -227,6 +227,33 @@ def _SparseDenseCwiseDivGrad(op, grad):
 
 
 @ops.RegisterGradient("SparseSoftmax")
-def _SparseSoftmaxGrad(unused_op, unused_grad):
-  raise NotImplementedError("SparseSoftmax op doesn't have its gradient"
-                            "implemented yet")
+def _SparseSoftmaxGrad(op, grad):
+  """Gradients for SparseSoftmax.
+
+  The calculation is the same as SoftmaxGrad:
+
+    grad_x = grad_softmax * softmax - sum(grad_softmax * softmax) * softmax
+
+  where we now only operate on the non-zero values present in the SparseTensors.
+
+  Args:
+    op: the SparseSoftmax op.
+    grad: the upstream gradient w.r.t. the non-zero SparseSoftmax output values.
+
+  Returns:
+    Gradients w.r.t. the input (sp_indices, sp_values, sp_shape).
+  """
+  indices, shape = op.inputs[0], op.inputs[2]
+  out_vals = op.outputs[0]
+  sp_output = ops.SparseTensor(indices, out_vals, shape)
+  sp_grad = ops.SparseTensor(indices, grad, shape)
+  sp_product = ops.SparseTensor(
+      indices, sp_output.values * sp_grad.values, shape)
+
+  # [..., B, 1], dense.
+  sum_reduced = -sparse_ops.sparse_reduce_sum(sp_product, [-1], keep_dims=True)
+  # sparse [..., B, C] + dense [..., B, 1] with broadcast; outputs sparse.
+  sp_sum = sparse_ops.sparse_dense_cwise_add(sp_grad, sum_reduced)
+
+  grad_x = sp_sum.values * sp_output.values
+  return [None, grad_x, None]
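
The identity quoted in the docstring can be sanity-checked outside of
TensorFlow. The following standalone NumPy sketch is not part of the patch
(all names in it are illustrative): it applies the dense formula to a single
vector of values, which is exactly how the sparse op treats each row's
non-zero entries, and compares the result against central finite differences.

import numpy as np

def softmax(x):
  # Numerically stable softmax over the last axis.
  e = np.exp(x - x.max(axis=-1, keepdims=True))
  return e / e.sum(axis=-1, keepdims=True)

def softmax_vjp(y, g):
  # The formula from the docstring, applied densely:
  #   grad_x = g * y - sum(g * y) * y  ==  (g - sum(g * y)) * y
  return (g - (g * y).sum(axis=-1, keepdims=True)) * y

rng = np.random.RandomState(0)
x = rng.randn(7)   # stands in for one row's non-zero values
g = rng.randn(7)   # upstream gradient w.r.t. the softmax outputs
analytic = softmax_vjp(softmax(x), g)

# Central finite differences of f(x) = softmax(x) . g, one input at a time.
eps = 1e-6
numeric = np.empty_like(x)
for i in range(x.size):
  xp, xm = x.copy(), x.copy()
  xp[i] += eps
  xm[i] -= eps
  numeric[i] = (softmax(xp).dot(g) - softmax(xm).dot(g)) / (2 * eps)

assert np.abs(analytic - numeric).max() < 1e-8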
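
For completeness, here is a minimal end-to-end usage sketch of the newly
registered gradient, assuming the 0.x-era graph API the patch targets
(tf.SparseTensor, tf.sparse_softmax, tf.gradients, tf.Session); the indices
and values are made up for illustration.

import numpy as np
import tensorflow as tf

# Illustrative data: a 2x4 SparseTensor with two non-zero entries per row.
indices = np.array([[0, 0], [0, 3], [1, 1], [1, 2]], dtype=np.int64)
values = np.array([0.5, -1.0, 2.0, 0.3], dtype=np.float32)
shape = np.array([2, 4], dtype=np.int64)

sp_x = tf.SparseTensor(indices, values, shape)
sp_y = tf.sparse_softmax(sp_x)

# Before this patch, differentiating through SparseSoftmax raised
# NotImplementedError; with it, the usual graph-mode machinery works.
grad_x = tf.gradients(sp_y.values, sp_x.values)[0]

with tf.Session() as sess:
  print(sess.run(grad_x))  # one gradient value per non-zero input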