Implement gradient for SparseSoftmax.
Essentially the same calculation as Softmax, adapted to sparse inputs/outputs. Change: 123332988
parent 6b8de8ac69
commit 008d41f4cc
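
The "same calculation as Softmax" claim can be checked in plain NumPy. The following sketch is not part of this commit (names are illustrative); it verifies the gradient identity quoted in the docstring below against the full softmax Jacobian:

import numpy as np

x = np.random.randn(5)
s = np.exp(x) / np.exp(x).sum()   # softmax(x)
g = np.random.randn(5)            # upstream gradient w.r.t. softmax(x)

# The identity used by _SparseSoftmaxGrad below:
grad_x = g * s - (g * s).sum() * s

# The same vector-Jacobian product from the full Jacobian, J = diag(s) - s s^T
# (J is symmetric, so J g == J^T g).
jac = np.diag(s) - np.outer(s, s)
assert np.allclose(grad_x, jac.dot(g))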
@@ -637,6 +637,17 @@ class SparseSoftmaxTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(sp_t.indices.eval(), result.indices)
       self.assertAllEqual(shape, result.shape)
 
+  def testGradient(self):
+    x_shape = [2, 5, 10]
+    with self.test_session(use_gpu=False):
+      for dtype in [np.float32, np.float64]:
+        x_np = np.random.randn(*x_shape).astype(dtype)
+        x_tf, nnz = _sparsify(x_np)
+        y_tf = tf.sparse_softmax(x_tf)
+        err = tf.test.compute_gradient_error(x_tf.values, (nnz,), y_tf.values,
+                                             (nnz,))
+        self.assertLess(err, 1e-4)
+
 
 if __name__ == "__main__":
   googletest.main()
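
_sparsify is a helper defined earlier in the test file and not shown in this diff. As a rough sketch of the behavior the test relies on (an assumption, not the file's actual code), it turns a dense array into a tf.SparseTensor of its non-zero entries and also returns how many there are:

def _sparsify(x, thresh=0.5):
  # Assumed behavior, for illustration only: zero out small entries, then
  # pack the survivors into a SparseTensor along with their count.
  x[x < thresh] = 0
  non_zero = np.where(x != 0)
  indices = np.vstack(non_zero).astype(np.int64).T  # shape [nnz, ndims]
  values = x[non_zero]
  return tf.SparseTensor(indices, values, x.shape), len(values)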
@@ -227,6 +227,33 @@ def _SparseDenseCwiseDivGrad(op, grad):
 
 
 @ops.RegisterGradient("SparseSoftmax")
-def _SparseSoftmaxGrad(unused_op, unused_grad):
-  raise NotImplementedError("SparseSoftmax op doesn't have its gradient"
-                            "implemented yet")
+def _SparseSoftmaxGrad(op, grad):
+  """Gradients for SparseSoftmax.
+
+  The calculation is the same as SoftmaxGrad:
+
+    grad_x = grad_softmax * softmax - sum(grad_softmax * softmax) * softmax
+
+  where we now only operate on the non-zero values present in the SparseTensors.
+
+  Args:
+    op: the SparseSoftmax op.
+    grad: the upstream gradient w.r.t. the non-zero SparseSoftmax output values.
+
+  Returns:
+    Gradients w.r.t. the input (sp_indices, sp_values, sp_shape).
+  """
+  indices, shape = op.inputs[0], op.inputs[2]
+  out_vals = op.outputs[0]
+  sp_output = ops.SparseTensor(indices, out_vals, shape)
+  sp_grad = ops.SparseTensor(indices, grad, shape)
+  sp_product = ops.SparseTensor(
+      indices, sp_output.values * sp_grad.values, shape)
+
+  # [..., B, 1], dense.
+  sum_reduced = -sparse_ops.sparse_reduce_sum(sp_product, [-1], keep_dims=True)
+  # sparse [..., B, C] + dense [..., B, 1] with broadcast; outputs sparse.
+  sp_sum = sparse_ops.sparse_dense_cwise_add(sp_grad, sum_reduced)
+
+  grad_x = sp_sum.values * sp_output.values
+  return [None, grad_x, None]
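
Once this gradient is registered, automatic differentiation flows through tf.sparse_softmax's non-zero values. A minimal sketch of that, assuming the TF-0.x-era API used above (the values here are hypothetical):

sp_x = tf.SparseTensor(indices=[[0, 0], [0, 2], [1, 1]],
                       values=tf.constant([1.0, 2.0, 3.0]),
                       shape=[2, 3])
sp_y = tf.sparse_softmax(sp_x)
loss = tf.reduce_sum(tf.square(sp_y.values))
# One gradient entry per non-zero input value; as the return value of
# _SparseSoftmaxGrad shows, indices and shape receive no gradient (None).
grad_vals, = tf.gradients(loss, [sp_x.values])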