Change weight broadcasting for losses.sparse_softmax_cross_entropy to match sparse_softmax_cross_entropy_with_logits. This means allowing inputs of rank greater than 2.
Also, fix some pydoc.

Change: 144898465
parent d8fd68242e
commit 91a6f2f4d6
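
To illustrate the headline change, here is a hedged sketch (not part of the commit; assumes the TF 1.x tf.losses endpoint for the function modified below). Labels and logits may now have rank greater than 2, with weights broadcast against labels the same way as in tf.nn.sparse_softmax_cross_entropy_with_logits:

    # Sketch of the enabled behavior; names and shapes are illustrative only.
    import tensorflow as tf

    batch_size, seq_len, num_classes = 2, 3, 5
    logits = tf.random_normal((batch_size, seq_len, num_classes))  # rank 3
    labels = tf.zeros((batch_size, seq_len), dtype=tf.int64)       # rank 2, class IDs
    weights = tf.ones((batch_size, seq_len))                       # same rank as labels

    # Before this change, labels were reshaped to [batch_size], so rank > 2
    # inputs were rejected; now they follow the shape contract of
    # tf.nn.sparse_softmax_cross_entropy_with_logits.
    loss = tf.losses.sparse_softmax_cross_entropy(labels, logits, weights)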
@@ -22,6 +22,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import confusion_matrix
 from tensorflow.python.ops import math_ops
@@ -215,5 +216,239 @@ class ConfusionMatrixTest(test.TestCase):
       self.assertEqual(tf_cm.dtype, np.int64)
 
 
+class RemoveSqueezableDimensionsTest(test.TestCase):
+
+  def testBothScalarShape(self):
+    label_values = 1.0
+    prediction_values = 0.0
+    static_labels, static_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            label_values, prediction_values))
+
+    labels_placeholder = array_ops.placeholder(dtype=dtypes.float32)
+    predictions_placeholder = array_ops.placeholder(dtype=dtypes.float32)
+    dynamic_labels, dynamic_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            labels_placeholder, predictions_placeholder))
+
+    with self.test_session():
+      self.assertAllEqual(label_values, static_labels.eval())
+      self.assertAllEqual(prediction_values, static_predictions.eval())
+      feed_dict = {
+          labels_placeholder: label_values,
+          predictions_placeholder: prediction_values
+      }
+      self.assertAllEqual(
+          label_values, dynamic_labels.eval(feed_dict=feed_dict))
+      self.assertAllEqual(
+          prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
+
+  def testSameShape(self):
+    label_values = np.ones(shape=(2, 3, 1))
+    prediction_values = np.zeros_like(label_values)
+    static_labels, static_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            label_values, prediction_values))
+
+    labels_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    predictions_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    dynamic_labels, dynamic_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            labels_placeholder, predictions_placeholder))
+
+    with self.test_session():
+      self.assertAllEqual(label_values, static_labels.eval())
+      self.assertAllEqual(prediction_values, static_predictions.eval())
+      feed_dict = {
+          labels_placeholder: label_values,
+          predictions_placeholder: prediction_values
+      }
+      self.assertAllEqual(
+          label_values, dynamic_labels.eval(feed_dict=feed_dict))
+      self.assertAllEqual(
+          prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
+
+  def testSameShapeExpectedRankDiff0(self):
+    label_values = np.ones(shape=(2, 3, 1))
+    prediction_values = np.zeros_like(label_values)
+    static_labels, static_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            label_values, prediction_values, expected_rank_diff=0))
+
+    labels_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    predictions_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    dynamic_labels, dynamic_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            labels_placeholder, predictions_placeholder, expected_rank_diff=0))
+
+    with self.test_session():
+      self.assertAllEqual(label_values, static_labels.eval())
+      self.assertAllEqual(prediction_values, static_predictions.eval())
+      feed_dict = {
+          labels_placeholder: label_values,
+          predictions_placeholder: prediction_values
+      }
+      self.assertAllEqual(
+          label_values, dynamic_labels.eval(feed_dict=feed_dict))
+      self.assertAllEqual(
+          prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
+
+  def testSqueezableLabels(self):
+    label_values = np.ones(shape=(2, 3, 1))
+    prediction_values = np.zeros(shape=(2, 3))
+    static_labels, static_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            label_values, prediction_values))
+
+    labels_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    predictions_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    dynamic_labels, dynamic_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            labels_placeholder, predictions_placeholder))
+
+    expected_label_values = np.reshape(label_values, newshape=(2, 3))
+    with self.test_session():
+      self.assertAllEqual(expected_label_values, static_labels.eval())
+      self.assertAllEqual(prediction_values, static_predictions.eval())
+      feed_dict = {
+          labels_placeholder: label_values,
+          predictions_placeholder: prediction_values
+      }
+      self.assertAllEqual(
+          expected_label_values, dynamic_labels.eval(feed_dict=feed_dict))
+      self.assertAllEqual(
+          prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
+
+  def testSqueezableLabelsExpectedRankDiffPlus1(self):
+    label_values = np.ones(shape=(2, 3, 1))
+    prediction_values = np.zeros(shape=(2, 3, 5))
+    static_labels, static_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            label_values, prediction_values, expected_rank_diff=1))
+
+    labels_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    predictions_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    dynamic_labels, dynamic_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            labels_placeholder, predictions_placeholder, expected_rank_diff=1))
+
+    expected_label_values = np.reshape(label_values, newshape=(2, 3))
+    with self.test_session():
+      self.assertAllEqual(expected_label_values, static_labels.eval())
+      self.assertAllEqual(prediction_values, static_predictions.eval())
+      feed_dict = {
+          labels_placeholder: label_values,
+          predictions_placeholder: prediction_values
+      }
+      self.assertAllEqual(
+          expected_label_values, dynamic_labels.eval(feed_dict=feed_dict))
+      self.assertAllEqual(
+          prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
+
+  def testSqueezablePredictions(self):
+    label_values = np.ones(shape=(2, 3))
+    prediction_values = np.zeros(shape=(2, 3, 1))
+    static_labels, static_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            label_values, prediction_values))
+
+    labels_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    predictions_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    dynamic_labels, dynamic_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            labels_placeholder, predictions_placeholder))
+
+    expected_prediction_values = np.reshape(prediction_values, newshape=(2, 3))
+    with self.test_session():
+      self.assertAllEqual(label_values, static_labels.eval())
+      self.assertAllEqual(expected_prediction_values, static_predictions.eval())
+      feed_dict = {
+          labels_placeholder: label_values,
+          predictions_placeholder: prediction_values
+      }
+      self.assertAllEqual(
+          label_values, dynamic_labels.eval(feed_dict=feed_dict))
+      self.assertAllEqual(
+          expected_prediction_values,
+          dynamic_predictions.eval(feed_dict=feed_dict))
+
+  def testSqueezablePredictionsExpectedRankDiffMinus1(self):
+    label_values = np.ones(shape=(2, 3, 5))
+    prediction_values = np.zeros(shape=(2, 3, 1))
+    static_labels, static_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            label_values, prediction_values, expected_rank_diff=-1))
+
+    labels_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    predictions_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    dynamic_labels, dynamic_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            labels_placeholder, predictions_placeholder, expected_rank_diff=-1))
+
+    expected_prediction_values = np.reshape(prediction_values, newshape=(2, 3))
+    with self.test_session():
+      self.assertAllEqual(label_values, static_labels.eval())
+      self.assertAllEqual(expected_prediction_values, static_predictions.eval())
+      feed_dict = {
+          labels_placeholder: label_values,
+          predictions_placeholder: prediction_values
+      }
+      self.assertAllEqual(
+          label_values, dynamic_labels.eval(feed_dict=feed_dict))
+      self.assertAllEqual(
+          expected_prediction_values,
+          dynamic_predictions.eval(feed_dict=feed_dict))
+
+  def testUnsqueezableLabels(self):
+    label_values = np.ones(shape=(2, 3, 2))
+    prediction_values = np.zeros(shape=(2, 3))
+    with self.assertRaisesRegexp(ValueError, r"Can not squeeze dim\[2\]"):
+      confusion_matrix.remove_squeezable_dimensions(
+          label_values, prediction_values)
+
+    labels_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    predictions_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    dynamic_labels, dynamic_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            labels_placeholder, predictions_placeholder))
+
+    with self.test_session():
+      feed_dict = {
+          labels_placeholder: label_values,
+          predictions_placeholder: prediction_values
+      }
+      with self.assertRaisesRegexp(
+          errors_impl.InvalidArgumentError,
+          "Tried to explicitly squeeze dimension 2"):
+        dynamic_labels.eval(feed_dict=feed_dict)
+      self.assertAllEqual(
+          prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
+
+  def testUnsqueezablePredictions(self):
+    label_values = np.ones(shape=(2, 3))
+    prediction_values = np.zeros(shape=(2, 3, 2))
+    with self.assertRaisesRegexp(ValueError, r"Can not squeeze dim\[2\]"):
+      confusion_matrix.remove_squeezable_dimensions(
+          label_values, prediction_values)
+
+    labels_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    predictions_placeholder = array_ops.placeholder(dtype=dtypes.int32)
+    dynamic_labels, dynamic_predictions = (
+        confusion_matrix.remove_squeezable_dimensions(
+            labels_placeholder, predictions_placeholder))
+
+    with self.test_session():
+      feed_dict = {
+          labels_placeholder: label_values,
+          predictions_placeholder: prediction_values
+      }
+      self.assertAllEqual(
+          label_values, dynamic_labels.eval(feed_dict=feed_dict))
+      with self.assertRaisesRegexp(
+          errors_impl.InvalidArgumentError,
+          "Tried to explicitly squeeze dimension 2"):
+        dynamic_predictions.eval(feed_dict=feed_dict)
+
+
 if __name__ == "__main__":
   test.main()
@@ -303,7 +303,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
     weights = 2.3
     with self.test_session():
       loss = losses.sparse_softmax_cross_entropy(
-          labels, logits, constant_op.constant(weights, shape=(1, 1)))
+          labels, logits, constant_op.constant((weights,)))
       self.assertAlmostEqual(weights * 10.0, loss.eval(), 3)
 
   def testNonZeroLossWithPlaceholderForWeights(self):
@@ -432,7 +432,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
       labels = constant_op.constant([[0, 1], [2, 3]])
       weights = constant_op.constant(1.2)
 
-      with self.assertRaises(errors_impl.InvalidArgumentError):
+      with self.assertRaisesRegexp(ValueError, 'dimension'):
         losses.sparse_softmax_cross_entropy(
             labels, logits, weights=weights).eval()
@@ -1241,6 +1241,9 @@ class ComputeWeightedLossTest(test.TestCase):
   def testInvalid1x2Weight(self):
     self._test_invalid_weights((17.0, 3.0,),)
 
+  def testInvalidScalar1DWeight(self):
+    self._test_invalid_weights((17.0,),)
+
   def _test_valid_weights(self, weights):
     with ops.Graph().as_default():
       self.assertEqual(0, len(util.get_losses()))
@@ -32,8 +32,19 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
 
 
-def remove_squeezable_dimensions(labels, predictions, name=None):
-  """Squeeze last dim if ranks of `predictions` and `labels` differ by 1.
+def remove_squeezable_dimensions(
+    labels, predictions, expected_rank_diff=0, name=None):
+  """Squeeze last dim if ranks differ from expected by exactly 1.
+
+  In the common case where we expect shapes to match, `expected_rank_diff`
+  defaults to 0, and we squeeze the last dimension of the larger rank if they
+  differ by 1.
+
+  But, for example, if `labels` contains class IDs and `predictions` contains 1
+  probability per class, we expect `predictions` to have 1 more dimension than
+  `labels`, so `expected_rank_diff` would be 1. In this case, we'd squeeze
+  `labels` if `rank(predictions) - rank(labels) == 0`, and
+  `predictions` if `rank(predictions) - rank(labels) == 2`.
 
   This will use static shape if available. Otherwise, it will add graph
   operations, which could result in a performance hit.
@@ -41,6 +52,7 @@ def remove_squeezable_dimensions(labels, predictions, name=None):
   Args:
     labels: Label values, a `Tensor` whose dimensions match `predictions`.
     predictions: Predicted values, a `Tensor` of arbitrary dimensions.
+    expected_rank_diff: Expected result of `rank(predictions) - rank(labels)`.
     name: Name of the op.
 
   Returns:
@@ -57,10 +69,10 @@ def remove_squeezable_dimensions(labels, predictions, name=None):
   if (labels_rank is not None) and (predictions_rank is not None):
     # Use static rank.
     rank_diff = predictions_rank - labels_rank
-    if rank_diff == -1:
-      labels = array_ops.squeeze(labels, [-1])
-    elif rank_diff == 1:
+    if rank_diff == expected_rank_diff + 1:
       predictions = array_ops.squeeze(predictions, [-1])
+    elif rank_diff == expected_rank_diff - 1:
+      labels = array_ops.squeeze(labels, [-1])
     return labels, predictions
 
   # Use dynamic rank.
@@ -68,13 +80,13 @@ def remove_squeezable_dimensions(labels, predictions, name=None):
   if (predictions_rank is None) or (
       predictions_shape.dims[-1].is_compatible_with(1)):
     predictions = control_flow_ops.cond(
-        math_ops.equal(1, rank_diff),
+        math_ops.equal(expected_rank_diff + 1, rank_diff),
         lambda: array_ops.squeeze(predictions, [-1]),
        lambda: predictions)
   if (labels_rank is None) or (
       labels_shape.dims[-1].is_compatible_with(1)):
     labels = control_flow_ops.cond(
-        math_ops.equal(-1, rank_diff),
+        math_ops.equal(expected_rank_diff - 1, rank_diff),
        lambda: array_ops.squeeze(labels, [-1]),
        lambda: labels)
   return labels, predictions
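
As a usage sketch of the new argument (hypothetical, mirroring testSqueezableLabelsExpectedRankDiffPlus1 above; assumes the internal confusion_matrix module path):

    # Hypothetical example; not part of the diff.
    import numpy as np
    from tensorflow.python.ops import confusion_matrix

    labels = np.ones(shape=(2, 3, 1))        # trailing dimension of 1 is squeezable
    predictions = np.zeros(shape=(2, 3, 5))  # one probability per class
    # We expect rank(predictions) - rank(labels) == 1, so labels loses its
    # trailing dimension and predictions pass through untouched.
    labels, predictions = confusion_matrix.remove_squeezable_dimensions(
        labels, predictions, expected_rank_diff=1)
    # labels now has shape (2, 3); predictions keeps shape (2, 3, 5).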
@@ -22,6 +22,8 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:confusion_matrix",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn",
@@ -20,6 +20,8 @@ from __future__ import print_function
 
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import confusion_matrix
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import nn_ops
@@ -130,16 +132,11 @@ def compute_weighted_loss(
     losses, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES):
   """Computes the weighted loss.
 
-  WARNING: `weights` also supports dimensions of 1, but the broadcasting does
-  not work as advertised, you'll wind up with weighted sum instead of weighted
-  mean for any but the last dimension. This will be cleaned up soon, so please
-  do not rely on the current behavior for anything but the shapes documented for
-  `weights` below.
-
   Args:
     losses: `Tensor` of shape `[batch_size, d1, ... dN]`.
-    weights: `Tensor` of shape `[]`, `[batch_size]` or
-      `[batch_size, d1, ... dK]`, where K < N.
+    weights: Optional `Tensor` whose rank is either 0, or the same rank as
+      `losses`, and must be broadcastable to `losses` (i.e., all dimensions must
+      be either `1`, or the same as the corresponding `losses` dimension).
     scope: the scope for the operations performed in computing the loss.
     loss_collection: the loss will be added to these collections.
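
Concretely, the new weights contract is plain elementwise broadcasting; a sketch, assuming the public tf.losses.compute_weighted_loss endpoint for the function documented above:

    # Sketch of the documented contract; endpoint name is an assumption.
    import tensorflow as tf

    losses = tf.ones((4, 5, 6))  # [batch_size, d1, d2]
    # Scalar weights are allowed...
    mean_a = tf.losses.compute_weighted_loss(losses, weights=2.0)
    # ...as are same-rank weights where each dimension is 1 or matches losses.
    mean_b = tf.losses.compute_weighted_loss(losses, weights=tf.ones((4, 1, 6)))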
@@ -180,17 +177,12 @@ def absolute_difference(
   measurable element of `predictions` is scaled by the corresponding value of
   `weights`.
 
-  WARNING: `weights` also supports dimensions of 1, but the broadcasting does
-  not work as advertised, you'll wind up with weighted sum instead of weighted
-  mean for any but the last dimension. This will be cleaned up soon, so please
-  do not rely on the current behavior for anything but the shapes documented for
-  `weights` below.
-
   Args:
     labels: The ground truth output tensor, same dimensions as 'predictions'.
     predictions: The predicted outputs.
-    weights: Coefficients for the loss a scalar, a tensor of shape
-      `[batch_size]` or a tensor whose shape matches `predictions`.
+    weights: Optional `Tensor` whose rank is either 0, or the same rank as
+      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
+      be either `1`, or the same as the corresponding `losses` dimension).
     scope: The scope for the operations performed in computing the loss.
     loss_collection: collection to which this loss will be added.
@@ -218,18 +210,13 @@ def cosine_distance(
   Note that the function assumes that `predictions` and `labels` are already
   unit-normalized.
 
-  WARNING: `weights` also supports dimensions of 1, but the broadcasting does
-  not work as advertised, you'll wind up with weighted sum instead of weighted
-  mean for any but the last dimension. This will be cleaned up soon, so please
-  do not rely on the current behavior for anything but the shapes documented for
-  `weights` below.
-
   Args:
     labels: `Tensor` whose shape matches 'predictions'
     predictions: An arbitrary matrix.
     dim: The dimension along which the cosine distance is computed.
-    weights: Coefficients for the loss a scalar, a tensor of shape
-      `[batch_size]` or a tensor whose shape matches `predictions`.
+    weights: Optional `Tensor` whose rank is either 0, or the same rank as
+      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
+      be either `1`, or the same as the corresponding `losses` dimension).
     scope: The scope for the operations performed in computing the loss.
     loss_collection: collection to which this loss will be added.
@@ -257,18 +244,13 @@ def hinge_loss(labels, logits, weights=1.0, scope=None,
                loss_collection=ops.GraphKeys.LOSSES):
   """Adds a hinge loss to the training procedure.
 
-  WARNING: `weights` also supports dimensions of 1, but the broadcasting does
-  not work as advertised, you'll wind up with weighted sum instead of weighted
-  mean for any but the last dimension. This will be cleaned up soon, so please
-  do not rely on the current behavior for anything but the shapes documented for
-  `weights` below.
-
   Args:
     labels: The ground truth output tensor. Its shape should match the shape of
       logits. The values of the tensor are expected to be 0.0 or 1.0.
     logits: The logits, a float tensor.
-    weights: Coefficients for the loss a scalar, a tensor of shape
-      `[batch_size]` or a tensor whose shape matches `predictions`.
+    weights: Optional `Tensor` whose rank is either 0, or the same rank as
+      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
+      be either `1`, or the same as the corresponding `losses` dimension).
     scope: The scope for the operations performed in computing the loss.
     loss_collection: collection to which the loss will be added.
@@ -302,17 +284,12 @@ def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None,
   measurable element of `predictions` is scaled by the corresponding value of
   `weights`.
 
-  WARNING: `weights` also supports dimensions of 1, but the broadcasting does
-  not work as advertised, you'll wind up with weighted sum instead of weighted
-  mean for any but the last dimension. This will be cleaned up soon, so please
-  do not rely on the current behavior for anything but the shapes documented for
-  `weights` below.
-
   Args:
     labels: The ground truth output tensor, same dimensions as 'predictions'.
     predictions: The predicted outputs.
-    weights: Coefficients for the loss a scalar, a tensor of shape
-      `[batch_size]` or a tensor whose shape matches `predictions`.
+    weights: Optional `Tensor` whose rank is either 0, or the same rank as
+      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
+      be either `1`, or the same as the corresponding `losses` dimension).
     epsilon: A small increment to add to avoid taking a log of zero.
     scope: The scope for the operations performed in computing the loss.
     loss_collection: collection to which the loss will be added.
@@ -427,17 +404,12 @@ def mean_squared_error(labels, predictions, weights=1.0, scope=None,
   measurable element of `predictions` is scaled by the corresponding value of
   `weights`.
 
-  WARNING: `weights` also supports dimensions of 1, but the broadcasting does
-  not work as advertised, you'll wind up with weighted sum instead of weighted
-  mean for any but the last dimension. This will be cleaned up soon, so please
-  do not rely on the current behavior for anything but the shapes documented for
-  `weights` below.
-
   Args:
     labels: The ground truth output tensor, same dimensions as 'predictions'.
     predictions: The predicted outputs.
-    weights: Coefficients for the loss a scalar, a tensor of shape
-      `[batch_size]` or a tensor whose shape matches `predictions`.
+    weights: Optional `Tensor` whose rank is either 0, or the same rank as
+      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
+      be either `1`, or the same as the corresponding `losses` dimension).
     scope: The scope for the operations performed in computing the loss.
     loss_collection: collection to which the loss will be added.
@@ -467,12 +439,6 @@ def sigmoid_cross_entropy(
   tensor of shape `[batch_size]`, then the loss weights apply to each
   corresponding sample.
 
-  WARNING: `weights` also supports dimensions of 1, but the broadcasting does
-  not work as advertised, you'll wind up with weighted sum instead of weighted
-  mean for any but the last dimension. This will be cleaned up soon, so please
-  do not rely on the current behavior for anything but the shapes documented for
-  `weights` below.
-
   If `label_smoothing` is nonzero, smooth the labels towards 1/2:
 
       new_multiclass_labels = multiclass_labels * (1 - label_smoothing)
@@ -482,8 +448,9 @@ def sigmoid_cross_entropy(
   Args:
     multi_class_labels: `[batch_size, num_classes]` target integer labels in
       `(0, 1)`.
     logits: `[batch_size, num_classes]` logits outputs of the network.
-    weights: Coefficients for the loss. This must be of shape `[]`,
-      `[batch_size]` or `[batch_size, num_classes]`.
+    weights: Optional `Tensor` whose rank is either 0, or the same rank as
+      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
+      be either `1`, or the same as the corresponding `losses` dimension).
     label_smoothing: If greater than `0` then smooth the labels.
     scope: The scope for the operations performed in computing the loss.
     loss_collection: collection to which the loss will be added.
@@ -522,12 +489,6 @@ def softmax_cross_entropy(
   tensor of shape `[batch_size]`, then the loss weights apply to each
   corresponding sample.
 
-  WARNING: `weights` also supports dimensions of 1, but the broadcasting does
-  not work as advertised, you'll wind up with weighted sum instead of weighted
-  mean for any but the last dimension. This will be cleaned up soon, so please
-  do not rely on the current behavior for anything but the shapes documented for
-  `weights` below.
-
   If `label_smoothing` is nonzero, smooth the labels towards 1/num_classes:
       new_onehot_labels = onehot_labels * (1 - label_smoothing)
                           + label_smoothing / num_classes
@@ -535,8 +496,10 @@ def softmax_cross_entropy(
   Args:
     onehot_labels: `[batch_size, num_classes]` target one-hot-encoded labels.
     logits: [batch_size, num_classes] logits outputs of the network .
-    weights: Coefficients for the loss. This must be of shape `[]`,
-      `[batch_size]` or `[batch_size, num_classes]`.
+    weights: Optional `Tensor` whose rank is either 0, or the same rank as
+      `onehot_labels`, and must be broadcastable to `onehot_labels` (i.e., all
+      dimensions must be either `1`, or the same as the corresponding `losses`
+      dimension).
     label_smoothing: If greater than 0 then smooth the labels.
     scope: the scope for the operations performed in computing the loss.
     loss_collection: collection to which the loss will be added.
@@ -567,6 +530,57 @@ def softmax_cross_entropy(
     return compute_weighted_loss(losses, weights, scope, loss_collection)
 
 
+# TODO(ptucker): Merge this with similar method in metrics_impl.
+def _remove_squeezable_dimensions(
+    labels, predictions, weights=None, expected_rank_diff=0):
+  """Internal version of _remove_squeezable_dimensions which handles weights.
+
+  Squeezes `predictions` and `labels` if their ranks differ from expected by
+  exactly 1.
+  Squeezes `weights` if its rank is 1 more than the new rank of `predictions`
+
+  This will use static shape if available. Otherwise, it will add graph
+  operations, which could result in a performance hit.
+
+  Args:
+    labels: Label values, a `Tensor` whose dimensions match `predictions`.
+    predictions: Predicted values, a `Tensor` of arbitrary dimensions.
+    weights: Optional weight `Tensor`. It will be squeezed if it's not scalar,
+      and its rank is 1 more than the new rank of `labels`.
+    expected_rank_diff: Expected result of `rank(predictions) - rank(labels)`.
+
+  Returns:
+    Tuple of `predictions`, `labels` and `weights`, possibly with the last
+    dimension squeezed.
+  """
+  labels, predictions = confusion_matrix.remove_squeezable_dimensions(
+      labels, predictions, expected_rank_diff=expected_rank_diff)
+
+  if weights is not None:
+    weights = ops.convert_to_tensor(weights)
+    labels_rank = labels.get_shape().ndims
+    weights_shape = weights.get_shape()
+    weights_rank = weights_shape.ndims
+
+    if (labels_rank is not None) and (weights_rank is not None):
+      # Use static rank.
+      rank_diff = weights_rank - labels_rank
+      if rank_diff == 1:
+        weights = array_ops.squeeze(weights, [-1])
+      return labels, predictions, weights
+
+    # Use dynamic rank.
+    rank_diff = array_ops.rank(weights) - array_ops.rank(labels)
+    if (weights_rank is None) or (
+        weights_shape.dims[-1].is_compatible_with(1)):
+      weights = control_flow_ops.cond(
+          math_ops.equal(1, rank_diff),
+          lambda: array_ops.squeeze(weights, [-1]),
+          lambda: weights)
+
+  return labels, predictions, weights
+
+
 def sparse_softmax_cross_entropy(labels, logits, weights=1.0, scope=None,
                                  loss_collection=ops.GraphKeys.LOSSES):
   """Cross-entropy loss using `tf.nn.sparse_softmax_cross_entropy_with_logits`.
@@ -576,18 +590,16 @@ def sparse_softmax_cross_entropy(labels, logits, weights=1.0, scope=None,
   tensor of shape [`batch_size`], then the loss weights apply to each
   corresponding sample.
 
-  WARNING: `weights` also supports dimensions of 1, but the broadcasting does
-  not work as advertised, you'll wind up with weighted sum instead of weighted
-  mean for any but the last dimension. This will be cleaned up soon, so please
-  do not rely on the current behavior for anything but the shapes documented for
-  `weights` below.
-
   Args:
-    labels: [batch_size, 1] or [batch_size] target labels of dtype `int32` or
-      `int64` in the range `[0, num_classes)`.
-    logits: [batch_size, num_classes] logits outputs of the network .
-    weights: Coefficients for the loss. This must be scalar or of shape
-      `[batch_size, 1]`.
+    labels: `Tensor` of shape `[d_0, d_1, ..., d_{r-1}]` (where `r` is rank of
+      `labels` and result) and dtype `int32` or `int64`. Each entry in `labels`
+      must be an index in `[0, num_classes)`. Other values will raise an
+      exception when this op is run on CPU, and return `NaN` for corresponding
+      loss and gradient rows on GPU.
+    logits: Unscaled log probabilities of shape
+      `[d_0, d_1, ..., d_{r-1}, num_classes]` and dtype `float32` or `float64`.
+    weights: Coefficients for the loss. This must be scalar or of same rank as
+      `labels`
     scope: the scope for the operations performed in computing the loss.
     loss_collection: collection to which the loss will be added.
@@ -600,12 +612,12 @@ def sparse_softmax_cross_entropy(labels, logits, weights=1.0, scope=None,
   """
   with ops.name_scope(scope, "sparse_softmax_cross_entropy_loss",
                       (logits, labels, weights)) as scope:
-    labels = array_ops.reshape(labels, shape=[array_ops.shape(labels)[0]])
+    # As documented above in Args, labels contain class IDs and logits contains
+    # 1 probability per class ID, so we expect rank(logits) - rank(labels) == 1;
+    # therefore, expected_rank_diff=1.
+    labels, logits, weights = _remove_squeezable_dimensions(
+        labels, logits, weights, expected_rank_diff=1)
     losses = nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                          logits=logits,
                                                          name="xentropy")
-    # Reshape losses to [batch_size, 1] to be consistent with weights.
-    # TODO(ptucker): reshape to (-1, 1) more efficient?
-    losses = array_ops.reshape(losses, shape=[array_ops.shape(losses)[0], 1])
     return compute_weighted_loss(losses, weights, scope, loss_collection)
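
Putting the pieces together, a hedged sketch (not from the commit) of the rank bookkeeping that now replaces the old reshapes:

    # Illustrative only; mirrors the expected_rank_diff=1 path above.
    import tensorflow as tf

    labels = tf.constant([[0, 1, 2], [2, 1, 0]])  # [2, 3], class IDs
    logits = tf.zeros((2, 3, 4))                  # [2, 3, 4], 4 classes
    weights = tf.ones((2, 3, 1))                  # rank(labels) + 1

    # rank(logits) - rank(labels) == 1 matches expected_rank_diff=1, so labels
    # and logits pass through unchanged; weights is squeezed to [2, 3] and then
    # broadcast against the [2, 3] elementwise cross-entropy losses.
    loss = tf.losses.sparse_softmax_cross_entropy(labels, logits, weights)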
@@ -87,7 +87,7 @@ def _remove_squeezable_dimensions(labels, predictions, weights):
       weights = array_ops.squeeze(weights, [-1])
     elif (weights_rank is None) or (
         weights_shape.dims[-1].is_compatible_with(1)):
-      # Use dynamic rank
+      # Use dynamic rank.
       weights = control_flow_ops.cond(
           math_ops.equal(array_ops.rank(weights),
                          math_ops.add(array_ops.rank(predictions), 1)),
@@ -1663,13 +1663,13 @@ def sparse_softmax_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=
 
   Args:
     _sentinel: Used to prevent positional parameters. Internal, do not use.
-    labels: `Tensor` of shape `[d_0, d_1, ..., d_{r-2}]` and dtype `int32` or
-      `int64`. Each entry in `labels` must be an index in `[0, num_classes)`.
-      Other values will raise an exception when this op is run on CPU, and
-      return `NaN` for corresponding corresponding loss and gradient rows
-      on GPU.
-    logits: Unscaled log probabilities of rank `r` and shape
-      `[d_0, d_1, ..., d_{r-2}, num_classes]` and dtype `float32` or `float64`.
+    labels: `Tensor` of shape `[d_0, d_1, ..., d_{r-1}]` (where `r` is rank of
+      `labels` and result) and dtype `int32` or `int64`. Each entry in `labels`
+      must be an index in `[0, num_classes)`. Other values will raise an
+      exception when this op is run on CPU, and return `NaN` for corresponding
+      loss and gradient rows on GPU.
+    logits: Unscaled log probabilities of shape
+      `[d_0, d_1, ..., d_{r-1}, num_classes]` and dtype `float32` or `float64`.
     name: A name for the operation (optional).
 
   Returns: