diff --git a/tensorflow/contrib/metrics/BUILD b/tensorflow/contrib/metrics/BUILD index 89e84ca535f..e20e494b9eb 100644 --- a/tensorflow/contrib/metrics/BUILD +++ b/tensorflow/contrib/metrics/BUILD @@ -33,19 +33,6 @@ py_test( ], ) -py_test( - name = "confusion_matrix_ops_test", - size = "medium", - srcs = ["python/kernel_tests/confusion_matrix_ops_test.py"], - srcs_version = "PY2AND3", - deps = [ - ":metrics_py", - "//tensorflow:tensorflow_py", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:platform_test", - ], -) - py_test( name = "histogram_ops_test", size = "medium", diff --git a/tensorflow/contrib/metrics/__init__.py b/tensorflow/contrib/metrics/__init__.py index 3ad53655bc1..aaa1b62d5f7 100644 --- a/tensorflow/contrib/metrics/__init__.py +++ b/tensorflow/contrib/metrics/__init__.py @@ -133,7 +133,6 @@ labels and predictions tensors and results in a weighted average of the metric. @@auc_using_histogram @@accuracy -@@confusion_matrix @@aggregate_metrics @@aggregate_metric_map diff --git a/tensorflow/contrib/metrics/python/ops/confusion_matrix_ops.py b/tensorflow/contrib/metrics/python/ops/confusion_matrix_ops.py index dd57f0478be..81bbe935e74 100644 --- a/tensorflow/contrib/metrics/python/ops/confusion_matrix_ops.py +++ b/tensorflow/contrib/metrics/python/ops/confusion_matrix_ops.py @@ -18,93 +18,13 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.contrib.framework import tensor_util from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops -from tensorflow.python.framework import sparse_tensor -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import sparse_ops +from tensorflow.python.ops import confusion_matrix as cm -def confusion_matrix(predictions, labels, num_classes=None, dtype=dtypes.int32, +def confusion_matrix(labels, predictions, num_classes=None, dtype=dtypes.int32, name=None, weights=None): - """Computes the confusion matrix from predictions and labels. - - Calculate the Confusion Matrix for a pair of prediction and - label 1-D int arrays. - - The matrix rows represent the prediction labels and the columns - represents the real labels. The confusion matrix is always a 2-D array - of shape `[n, n]`, where `n` is the number of valid labels for a given - classification task. Both prediction and labels must be 1-D arrays of - the same shape in order for this function to work. - - If `num_classes` is None, then `num_classes` will be set to the one plus - the maximum value in either predictions or labels. - Class labels are expected to start at 0. E.g., if `num_classes` was - three, then the possible labels would be `[0, 1, 2]`. - - If `weights` is not `None`, then each prediction contributes its - corresponding weight to the total value of the confusion matrix cell. - - For example: - - ```python - tf.contrib.metrics.confusion_matrix([1, 2, 4], [2, 2, 4]) ==> - [[0 0 0 0 0] - [0 0 1 0 0] - [0 0 1 0 0] - [0 0 0 0 0] - [0 0 0 0 1]] - ``` - - Note that the possible labels are assumed to be `[0, 1, 2, 3, 4]`, - resulting in a 5x5 confusion matrix. - - Args: - predictions: A 1-D array representing the predictions for a given - classification. - labels: A 1-D representing the real labels for the classification task. - num_classes: The possible number of labels the classification task can - have. If this value is not provided, it will be calculated - using both predictions and labels array. 
- dtype: Data type of the confusion matrix. - name: Scope name. - weights: An optional `Tensor` whose shape matches `predictions`. - - Returns: - A k X k matrix representing the confusion matrix, where k is the number of - possible labels in the classification task. - - Raises: - ValueError: If both predictions and labels are not 1-D vectors and have - mismatched shapes, or if `weights` is not `None` and its shape doesn't - match `predictions`. - """ - with ops.name_scope(name, 'confusion_matrix', - [predictions, labels, num_classes]) as name: - predictions, labels = tensor_util.remove_squeezable_dimensions( - ops.convert_to_tensor( - predictions, name='predictions'), - ops.convert_to_tensor(labels, name='labels')) - predictions = math_ops.cast(predictions, dtypes.int64) - labels = math_ops.cast(labels, dtypes.int64) - - if num_classes is None: - num_classes = math_ops.maximum(math_ops.reduce_max(predictions), - math_ops.reduce_max(labels)) + 1 - - if weights is not None: - predictions.get_shape().assert_is_compatible_with(weights.get_shape()) - weights = math_ops.cast(weights, dtype) - - shape = array_ops.pack([num_classes, num_classes]) - indices = array_ops.transpose(array_ops.pack([predictions, labels])) - values = (array_ops.ones_like(predictions, dtype) - if weights is None else weights) - cm_sparse = sparse_tensor.SparseTensor( - indices=indices, values=values, shape=math_ops.to_int64(shape)) - zero_matrix = array_ops.zeros(math_ops.to_int32(shape), dtype) - - return sparse_ops.sparse_add(zero_matrix, cm_sparse) + """Deprecated. Use tf.confusion_matrix instead.""" + return cm.confusion_matrix(labels=labels, predictions=predictions, + num_classes=num_classes, dtype=dtype, name=name, + weights=weights) diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py index 172f6976cc9..d3f7c9018fe 100644 --- a/tensorflow/contrib/metrics/python/ops/metric_ops.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py @@ -25,7 +25,6 @@ from __future__ import print_function from tensorflow.contrib.framework import deprecated from tensorflow.contrib.framework import tensor_util from tensorflow.contrib.framework.python.ops import variables as contrib_variables -from tensorflow.contrib.metrics.python.ops import confusion_matrix_ops from tensorflow.contrib.metrics.python.ops import set_ops from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops @@ -34,6 +33,7 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import metrics from tensorflow.python.ops import nn from tensorflow.python.ops import sparse_ops from tensorflow.python.ops import state_ops @@ -178,16 +178,10 @@ def streaming_true_positives(predictions, labels, weights=None, either `metrics_collections` or `updates_collections` are not a list or tuple. 
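The shim above keeps `tf.contrib.metrics.confusion_matrix` importable while delegating to the relocated core op, and it also flips the signature to labels-first. A minimal usage sketch, assuming the TF 0.12-era graph/session workflow used by the tests later in this diff (the tensor values are illustrative, taken from the removed docstring example):

```python
import tensorflow as tf

labels = tf.constant([2, 2, 4])
predictions = tf.constant([1, 2, 4])

# Core op (labels first); a 5x5 int32 matrix here because the largest
# class id seen is 4.
cm = tf.confusion_matrix(labels, predictions)

# Deprecated contrib wrapper: after this change it also takes labels first
# and simply forwards to the core implementation.
cm_compat = tf.contrib.metrics.confusion_matrix(labels, predictions)

with tf.Session() as sess:
  print(sess.run(cm))
```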
""" - with variable_scope.variable_scope( - name, 'true_positives', (predictions, labels, weights)): - - predictions = ops.convert_to_tensor(predictions) - labels = ops.convert_to_tensor(labels) - predictions.get_shape().assert_is_compatible_with(labels.get_shape()) - is_true_positive = math_ops.logical_and(math_ops.equal(labels, 1), - math_ops.equal(predictions, 1)) - return _count_condition(is_true_positive, weights, metrics_collections, - updates_collections) + return metrics.true_positives( + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def streaming_true_negatives(predictions, labels, weights=None, @@ -262,16 +256,10 @@ def streaming_false_positives(predictions, labels, weights=None, either `metrics_collections` or `updates_collections` are not a list or tuple. """ - with variable_scope.variable_scope( - name, 'false_positives', (predictions, labels, weights)): - - predictions = ops.convert_to_tensor(predictions) - labels = ops.convert_to_tensor(labels) - predictions.get_shape().assert_is_compatible_with(labels.get_shape()) - is_false_positive = math_ops.logical_and(math_ops.equal(labels, 0), - math_ops.equal(predictions, 1)) - return _count_condition(is_false_positive, weights, metrics_collections, - updates_collections) + return metrics.false_positives( + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def streaming_false_negatives(predictions, labels, weights=None, @@ -303,16 +291,10 @@ def streaming_false_negatives(predictions, labels, weights=None, or if either `metrics_collections` or `updates_collections` are not a list or tuple. """ - with variable_scope.variable_scope( - name, 'false_negatives', (predictions, labels, weights)): - - predictions = ops.convert_to_tensor(predictions) - labels = ops.convert_to_tensor(labels) - predictions.get_shape().assert_is_compatible_with(labels.get_shape()) - is_false_negative = math_ops.logical_and(math_ops.equal(labels, 1), - math_ops.equal(predictions, 0)) - return _count_condition(is_false_negative, weights, metrics_collections, - updates_collections) + return metrics.false_negatives( + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def _broadcast_weights(weights, values): @@ -376,33 +358,9 @@ def streaming_mean(values, weights=None, metrics_collections=None, or if either `metrics_collections` or `updates_collections` are not a list or tuple. 
""" - with variable_scope.variable_scope(name, 'mean', (values, weights)): - values = math_ops.to_float(values) - - total = _create_local('total', shape=[]) - count = _create_local('count', shape=[]) - - if weights is not None: - weights = math_ops.to_float(weights) - values = math_ops.mul(values, weights) - num_values = math_ops.reduce_sum(_broadcast_weights(weights, values)) - else: - num_values = math_ops.to_float(array_ops.size(values)) - - total_compute_op = state_ops.assign_add(total, math_ops.reduce_sum(values)) - count_compute_op = state_ops.assign_add(count, num_values) - - mean = _safe_div(total, count, 'value') - with ops.control_dependencies([total_compute_op, count_compute_op]): - update_op = _safe_div(total, count, 'update_op') - - if metrics_collections: - ops.add_to_collections(metrics_collections, mean) - - if updates_collections: - ops.add_to_collections(updates_collections, update_op) - - return mean, update_op + return metrics.mean( + values=values, weights=weights, metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def streaming_mean_tensor(values, weights=None, metrics_collections=None, @@ -445,36 +403,9 @@ def streaming_mean_tensor(values, weights=None, metrics_collections=None, or if either `metrics_collections` or `updates_collections` are not a list or tuple. """ - with variable_scope.variable_scope(name, 'mean', (values, weights)): - total = _create_local('total_tensor', shape=values.get_shape()) - count = _create_local('count_tensor', shape=values.get_shape()) - - num_values = array_ops.ones_like(values) - if weights is not None: - weights = math_ops.to_float(weights) - values = math_ops.mul(values, weights) - num_values = math_ops.mul(num_values, weights) - - total_compute_op = state_ops.assign_add(total, values) - count_compute_op = state_ops.assign_add(count, num_values) - - def compute_mean(total, count, name): - non_zero_count = math_ops.maximum(count, - array_ops.ones_like(count), - name=name) - return math_ops.truediv(total, non_zero_count, name=name) - - mean = compute_mean(total, count, 'value') - with ops.control_dependencies([total_compute_op, count_compute_op]): - update_op = compute_mean(total, count, 'update_op') - - if metrics_collections: - ops.add_to_collections(metrics_collections, mean) - - if updates_collections: - ops.add_to_collections(updates_collections, update_op) - - return mean, update_op + return metrics.mean_tensor( + values=values, weights=weights, metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def streaming_accuracy(predictions, labels, weights=None, @@ -520,14 +451,10 @@ def streaming_accuracy(predictions, labels, weights=None, either `metrics_collections` or `updates_collections` are not a list or tuple. 
""" - predictions, labels, weights = _remove_squeezable_dimensions( - predictions, labels, weights=weights) - predictions.get_shape().assert_is_compatible_with(labels.get_shape()) - if labels.dtype != predictions.dtype: - predictions = math_ops.cast(predictions, labels.dtype) - is_correct = math_ops.to_float(math_ops.equal(predictions, labels)) - return streaming_mean(is_correct, weights, metrics_collections, - updates_collections, name or 'accuracy') + return metrics.accuracy( + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def streaming_precision(predictions, labels, weights=None, @@ -572,39 +499,10 @@ def streaming_precision(predictions, labels, weights=None, either `metrics_collections` or `updates_collections` are not a list or tuple. """ - with variable_scope.variable_scope( - name, 'precision', (predictions, labels, weights)): - - predictions, labels, weights = _remove_squeezable_dimensions( - predictions, labels, weights) - predictions.get_shape().assert_is_compatible_with(labels.get_shape()) - - true_positives, true_positives_update_op = streaming_true_positives( - predictions, labels, weights, metrics_collections=None, - updates_collections=None, name=None) - false_positives, false_positives_update_op = streaming_false_positives( - predictions, labels, weights, metrics_collections=None, - updates_collections=None, name=None) - - def compute_precision(name): - return array_ops.where( - math_ops.greater(true_positives + false_positives, 0), - math_ops.div(true_positives, true_positives + false_positives), - 0, - name) - - precision = compute_precision('value') - with ops.control_dependencies([true_positives_update_op, - false_positives_update_op]): - update_op = compute_precision('update_op') - - if metrics_collections: - ops.add_to_collections(metrics_collections, precision) - - if updates_collections: - ops.add_to_collections(updates_collections, update_op) - - return precision, update_op + return metrics.precision( + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def streaming_recall(predictions, labels, weights=None, @@ -647,38 +545,10 @@ def streaming_recall(predictions, labels, weights=None, either `metrics_collections` or `updates_collections` are not a list or tuple. 
""" - with variable_scope.variable_scope( - name, 'recall', (predictions, labels, weights)): - predictions, labels, weights = _remove_squeezable_dimensions( - predictions, labels, weights) - predictions.get_shape().assert_is_compatible_with(labels.get_shape()) - - true_positives, true_positives_update_op = streaming_true_positives( - predictions, labels, weights, metrics_collections=None, - updates_collections=None, name=None) - false_negatives, false_negatives_update_op = streaming_false_negatives( - predictions, labels, weights, metrics_collections=None, - updates_collections=None, name=None) - - def compute_recall(true_positives, false_negatives, name): - return array_ops.where( - math_ops.greater(true_positives + false_negatives, 0), - math_ops.div(true_positives, true_positives + false_negatives), - 0, - name) - - recall = compute_recall(true_positives, false_negatives, 'value') - with ops.control_dependencies([true_positives_update_op, - false_negatives_update_op]): - update_op = compute_recall(true_positives, false_negatives, 'update_op') - - if metrics_collections: - ops.add_to_collections(metrics_collections, recall) - - if updates_collections: - ops.add_to_collections(updates_collections, update_op) - - return recall, update_op + return metrics.recall( + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def _streaming_confusion_matrix_at_thresholds( @@ -903,50 +773,10 @@ def streaming_auc(predictions, labels, weights=None, num_thresholds=200, either `metrics_collections` or `updates_collections` are not a list or tuple. """ - with variable_scope.variable_scope( - name, 'auc', (predictions, labels, weights)): - if curve != 'ROC' and curve != 'PR': - raise ValueError('curve must be either ROC or PR, %s unknown' % - (curve)) - kepsilon = 1e-7 # to account for floating point imprecisions - thresholds = [(i + 1) * 1.0 / (num_thresholds - 1) - for i in range(num_thresholds-2)] - thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon] - - values, update_ops = _streaming_confusion_matrix_at_thresholds( - predictions, labels, thresholds, weights) - - # Add epsilons to avoid dividing by 0. - epsilon = 1.0e-6 - def compute_auc(tp, fn, tn, fp, name): - """Computes the roc-auc or pr-auc based on confusion counts.""" - recall = math_ops.div(tp + epsilon, tp + fn + epsilon) - if curve == 'ROC': - fp_rate = math_ops.div(fp, fp + tn + epsilon) - x = fp_rate - y = recall - else: # curve == 'PR'. 
- precision = math_ops.div(tp + epsilon, tp + fp + epsilon) - x = recall - y = precision - return math_ops.reduce_sum(math_ops.mul( - x[:num_thresholds - 1] - x[1:], - (y[:num_thresholds - 1] + y[1:]) / 2.), name=name) - - # sum up the areas of all the trapeziums - auc = compute_auc( - values['tp'], values['fn'], values['tn'], values['fp'], 'value') - update_op = compute_auc( - update_ops['tp'], update_ops['fn'], update_ops['tn'], update_ops['fp'], - 'update_op') - - if metrics_collections: - ops.add_to_collections(metrics_collections, auc) - - if updates_collections: - ops.add_to_collections(updates_collections, update_op) - - return auc, update_op + return metrics.auc( + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, num_thresholds=num_thresholds, + curve=curve, updates_collections=updates_collections, name=name) def streaming_specificity_at_sensitivity( @@ -998,60 +828,11 @@ def streaming_specificity_at_sensitivity( `sensitivity` is not between 0 and 1, or if either `metrics_collections` or `updates_collections` are not a list or tuple. """ - if sensitivity < 0 or sensitivity > 1: - raise ValueError('`sensitivity` must be in the range [0, 1].') - - with variable_scope.variable_scope(name, 'specificity_at_sensitivity', - (predictions, labels, weights)): - kepsilon = 1e-7 # to account for floating point imprecisions - thresholds = [(i + 1) * 1.0 / (num_thresholds - 1) - for i in range(num_thresholds-2)] - thresholds = [0.0 - kepsilon] + thresholds + [1.0 - kepsilon] - - values, update_ops = _streaming_confusion_matrix_at_thresholds( - predictions, labels, thresholds, weights) - tp = values['tp'] - fn = values['fn'] - tn = values['tn'] - fp = values['fp'] - - def compute_specificity_at_sensitivity(name): - """Computes the specificity at the given sensitivity. - - Args: - name: The name of the operation. - - Returns: - The specificity using the aggregated values. - """ - sensitivities = math_ops.div(tp, tp + fn + kepsilon) - - # We'll need to use this trick until tf.argmax allows us to specify - # whether we should use the first or last index in case of ties. 
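The trapezoid-summation AUC above is replaced by a call to `metrics.auc`, which exposes the same `num_thresholds` and `curve` ('ROC' or 'PR') knobs that the wrapper forwards. A hedged sketch of the core call (the scores and labels below are made up for illustration):

```python
import tensorflow as tf
from tensorflow.python.ops import metrics

labels = tf.constant([0, 0, 1, 1])
predictions = tf.constant([0.1, 0.4, 0.35, 0.8])  # scores in [0, 1]
auc, update_op = metrics.auc(labels=labels, predictions=predictions,
                             num_thresholds=200, curve='ROC')

with tf.Session() as sess:
  sess.run(tf.local_variables_initializer())
  sess.run(update_op)
  print(sess.run(auc))
```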
- min_val = math_ops.reduce_min(math_ops.abs(sensitivities - sensitivity)) - indices_at_minval = math_ops.equal( - math_ops.abs(sensitivities - sensitivity), min_val) - indices_at_minval = math_ops.to_int64(indices_at_minval) - indices_at_minval = math_ops.cumsum(indices_at_minval) - tf_index = math_ops.argmax(indices_at_minval, 0) - tf_index = math_ops.cast(tf_index, dtypes.int32) - - # Now, we have the implicit threshold, so compute the specificity: - return math_ops.div(tn[tf_index], - tn[tf_index] + fp[tf_index] + kepsilon, - name) - - specificity = compute_specificity_at_sensitivity('value') - with ops.control_dependencies(update_ops.values()): - update_op = compute_specificity_at_sensitivity('update_op') - - if metrics_collections: - ops.add_to_collections(metrics_collections, specificity) - - if updates_collections: - ops.add_to_collections(updates_collections, update_op) - - return specificity, update_op + return metrics.specificity_at_sensitivity( + sensitivity=sensitivity, num_thresholds=num_thresholds, + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def streaming_sensitivity_at_specificity( @@ -1103,44 +884,11 @@ def streaming_sensitivity_at_specificity( `specificity` is not between 0 and 1, or if either `metrics_collections` or `updates_collections` are not a list or tuple. """ - if specificity < 0 or specificity > 1: - raise ValueError('`specificity` must be in the range [0, 1].') - - with variable_scope.variable_scope(name, 'sensitivity_at_specificity', - (predictions, labels, weights)): - kepsilon = 1e-7 # to account for floating point imprecisions - thresholds = [(i + 1) * 1.0 / (num_thresholds - 1) - for i in range(num_thresholds-2)] - thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon] - - values, update_ops = _streaming_confusion_matrix_at_thresholds( - predictions, labels, thresholds, weights) - tp = values['tp'] - fn = values['fn'] - tn = values['tn'] - fp = values['fp'] - - def compute_sensitivity_at_specificity(name): - specificities = math_ops.div(tn, tn + fp + kepsilon) - tf_index = math_ops.argmin(math_ops.abs(specificities - specificity), 0) - tf_index = math_ops.cast(tf_index, dtypes.int32) - - # Now, we have the implicit threshold, so compute the sensitivity: - return math_ops.div(tp[tf_index], - tp[tf_index] + fn[tf_index] + kepsilon, - name) - - sensitivity = compute_sensitivity_at_specificity('value') - with ops.control_dependencies(update_ops.values()): - update_op = compute_sensitivity_at_specificity('update_op') - - if metrics_collections: - ops.add_to_collections(metrics_collections, sensitivity) - - if updates_collections: - ops.add_to_collections(updates_collections, update_op) - - return sensitivity, update_op + return metrics.sensitivity_at_specificity( + specificity=specificity, num_thresholds=num_thresholds, + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def streaming_precision_at_thresholds(predictions, labels, thresholds, @@ -1187,29 +935,11 @@ def streaming_precision_at_thresholds(predictions, labels, thresholds, either `metrics_collections` or `updates_collections` are not a list or tuple. 
""" - with variable_scope.variable_scope(name, 'precision_at_thresholds', - (predictions, labels, weights)): - values, update_ops = _streaming_confusion_matrix_at_thresholds( - predictions, labels, thresholds, weights, includes=('tp', 'fp')) - tp = values['tp'] - fp = values['fp'] - - # Avoid division by zero. - epsilon = 1e-7 - def compute_precision(name): - return math_ops.div(tp, epsilon + tp + fp, name='precision_' + name) - - precision = compute_precision('value') - with ops.control_dependencies(update_ops.values()): - update_op = compute_precision('update_op') - - if metrics_collections: - ops.add_to_collections(metrics_collections, precision) - - if updates_collections: - ops.add_to_collections(updates_collections, update_op) - - return precision, update_op + return metrics.precision_at_thresholds( + thresholds=thresholds, + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def streaming_recall_at_thresholds(predictions, labels, thresholds, @@ -1253,29 +983,11 @@ def streaming_recall_at_thresholds(predictions, labels, thresholds, either `metrics_collections` or `updates_collections` are not a list or tuple. """ - with variable_scope.variable_scope(name, 'recall_at_thresholds', - (predictions, labels, weights)): - values, update_ops = _streaming_confusion_matrix_at_thresholds( - predictions, labels, thresholds, weights, includes=('tp', 'fn')) - tp = values['tp'] - fn = values['fn'] - - # Avoid division by zero. - epsilon = 1e-7 - def compute_recall(name): - return math_ops.div(tp, epsilon + tp + fn, name='recall_' + name) - - recall = compute_recall('value') - with ops.control_dependencies(update_ops.values()): - update_op = compute_recall('update_op') - - if metrics_collections: - ops.add_to_collections(metrics_collections, recall) - - if updates_collections: - ops.add_to_collections(updates_collections, update_op) - - return recall, update_op + return metrics.recall_at_thresholds( + thresholds=thresholds, + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def _at_k_name(name, k=None, class_id=None): @@ -1413,25 +1125,11 @@ def streaming_sparse_recall_at_k(predictions, `predictions`, or if either `metrics_collections` or `updates_collections` are not a list or tuple. 
""" - default_name = _at_k_name('recall', k, class_id=class_id) - with ops.name_scope(name, default_name, (predictions, labels)) as scope: - _, top_k_idx = nn.top_k(predictions, k) - top_k_idx = math_ops.to_int64(top_k_idx) - tp, tp_update = _streaming_sparse_true_positive_at_k( - predictions_idx=top_k_idx, labels=labels, k=k, class_id=class_id, - weights=weights) - fn, fn_update = _streaming_sparse_false_negative_at_k( - predictions_idx=top_k_idx, labels=labels, k=k, class_id=class_id, - weights=weights) - - metric = math_ops.div(tp, math_ops.add(tp, fn), name=scope) - update = math_ops.div( - tp_update, math_ops.add(tp_update, fn_update), name='update') - if metrics_collections: - ops.add_to_collections(metrics_collections, metric) - if updates_collections: - ops.add_to_collections(updates_collections, update) - return metric, update + return metrics.recall_at_k( + k=k, class_id=class_id, + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def _streaming_sparse_precision_at_k(top_k_idx, @@ -1575,19 +1273,11 @@ def streaming_sparse_precision_at_k(predictions, `predictions`, or if either `metrics_collections` or `updates_collections` are not a list or tuple. """ - default_name = _at_k_name('precision', k, class_id=class_id) - with ops.name_scope(name, default_name, - (predictions, labels, weights)) as scope: - _, top_k_idx = nn.top_k(predictions, k) - return _streaming_sparse_precision_at_k( - top_k_idx=top_k_idx, - labels=labels, - k=k, - class_id=class_id, - weights=weights, - metrics_collections=metrics_collections, - updates_collections=updates_collections, - name=scope) + return metrics.sparse_precision_at_k( + k=k, class_id=class_id, + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) # TODO(ptucker): Validate range of values in labels? @@ -1918,50 +1608,10 @@ def streaming_sparse_average_precision_at_k(predictions, update: `Operation` that increments variables appropriately, and whose value matches `metric`. """ - default_name = _at_k_name('average_precision', k) - with ops.name_scope(name, default_name, (predictions, labels)) as scope: - # Calculate per-example average precision, and apply weights. - average_precision = sparse_average_precision_at_k( - predictions=predictions, labels=labels, k=k) - if weights is not None: - weights = math_ops.to_double(weights) - average_precision = math_ops.mul(average_precision, weights) - - # Create accumulation variables and update ops for max average precision and - # total average precision. - with ops.name_scope(None, 'max', (average_precision,)) as max_scope: - # `max` is the max possible precision. Since max for any row is 1.0: - # - For the unweighted case, this is just the number of rows. - # - For the weighted case, it's the sum of the weights broadcast across - # `average_precision` rows. - max_var = contrib_variables.local_variable( - array_ops.zeros([], dtype=dtypes.float64), name=max_scope) - if weights is None: - batch_max = math_ops.to_double( - array_ops.size(average_precision, name='batch_max')) - else: - # TODO(ptucker): More efficient way to broadcast? 
- broadcast_weights = math_ops.mul( - weights, array_ops.ones_like(average_precision), - name='broadcast_weights') - batch_max = math_ops.reduce_sum(broadcast_weights, name='batch_max') - max_update = state_ops.assign_add(max_var, batch_max, name='update') - with ops.name_scope(None, 'total', (average_precision,)) as total_scope: - total_var = contrib_variables.local_variable( - array_ops.zeros([], dtype=dtypes.float64), name=total_scope) - batch_total = math_ops.reduce_sum(average_precision, name='batch_total') - total_update = state_ops.assign_add(total_var, batch_total, name='update') - - # Divide total by max to get mean, for both vars and the update ops. - mean_average_precision = _safe_scalar_div(total_var, max_var, name='mean') - update = _safe_scalar_div(total_update, max_update, name=scope) - - if metrics_collections: - ops.add_to_collections(metrics_collections, mean_average_precision) - if updates_collections: - ops.add_to_collections(updates_collections, update) - - return mean_average_precision, update + return metrics.sparse_average_precision_at_k( + k=k, predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def _select_class_id(ids, selected_id): @@ -2329,12 +1979,10 @@ def streaming_mean_absolute_error(predictions, labels, weights=None, either `metrics_collections` or `updates_collections` are not a list or tuple. """ - predictions, labels, weights = _remove_squeezable_dimensions( - predictions, labels, weights) - predictions.get_shape().assert_is_compatible_with(labels.get_shape()) - absolute_errors = math_ops.abs(predictions - labels) - return streaming_mean(absolute_errors, weights, metrics_collections, - updates_collections, name or 'mean_absolute_error') + return metrics.mean_absolute_error( + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def streaming_mean_relative_error(predictions, labels, normalizer, weights=None, @@ -2382,19 +2030,10 @@ def streaming_mean_relative_error(predictions, labels, normalizer, weights=None, either `metrics_collections` or `updates_collections` are not a list or tuple. """ - predictions, labels, weights = _remove_squeezable_dimensions( - predictions, labels, weights) - predictions.get_shape().assert_is_compatible_with(labels.get_shape()) - - predictions, normalizer = tensor_util.remove_squeezable_dimensions( - predictions, normalizer) - predictions.get_shape().assert_is_compatible_with(normalizer.get_shape()) - relative_errors = array_ops.where( - math_ops.equal(normalizer, 0.0), - array_ops.zeros_like(labels), - math_ops.div(math_ops.abs(labels - predictions), normalizer)) - return streaming_mean(relative_errors, weights, metrics_collections, - updates_collections, name or 'mean_relative_error') + return metrics.mean_relative_error( + normalizer=normalizer, predictions=predictions, labels=labels, + weights=weights, metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def streaming_mean_squared_error(predictions, labels, weights=None, @@ -2441,12 +2080,10 @@ def streaming_mean_squared_error(predictions, labels, weights=None, either `metrics_collections` or `updates_collections` are not a list or tuple. 
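The regression-style metrics converted in this stretch (`mean_absolute_error`, `mean_relative_error`, and, just below, `mean_squared_error` / `root_mean_squared_error`) all become one-line forwards to their `tf.metrics` counterparts. A small sketch of the squared-error pair (data is illustrative):

```python
import tensorflow as tf
from tensorflow.python.ops import metrics

labels = tf.constant([1.0, 2.0, 3.0])
predictions = tf.constant([1.5, 2.0, 2.0])

mse, mse_update = metrics.mean_squared_error(
    labels=labels, predictions=predictions)
rmse, rmse_update = metrics.root_mean_squared_error(
    labels=labels, predictions=predictions)

with tf.Session() as sess:
  sess.run(tf.local_variables_initializer())
  sess.run([mse_update, rmse_update])
  print(sess.run([mse, rmse]))  # ~0.417 and its square root
```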
""" - predictions, labels, weights = _remove_squeezable_dimensions( - predictions, labels, weights) - predictions.get_shape().assert_is_compatible_with(labels.get_shape()) - squared_error = math_ops.square(labels - predictions) - return streaming_mean(squared_error, weights, metrics_collections, - updates_collections, name or 'mean_squared_error') + return metrics.mean_squared_error( + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def streaming_root_mean_squared_error(predictions, labels, weights=None, @@ -2493,24 +2130,10 @@ def streaming_root_mean_squared_error(predictions, labels, weights=None, either `metrics_collections` or `updates_collections` are not a list or tuple. """ - predictions, labels, weights = _remove_squeezable_dimensions( - predictions, labels, weights) - predictions.get_shape().assert_is_compatible_with(labels.get_shape()) - value_tensor, update_op = streaming_mean_squared_error( - predictions, labels, weights, None, None, - name or 'root_mean_squared_error') - - root_mean_squared_error = math_ops.sqrt(value_tensor) - with ops.control_dependencies([update_op]): - update_op = math_ops.sqrt(update_op) - - if metrics_collections: - ops.add_to_collections(metrics_collections, root_mean_squared_error) - - if updates_collections: - ops.add_to_collections(updates_collections, update_op) - - return root_mean_squared_error, update_op + return metrics.root_mean_squared_error( + predictions=predictions, labels=labels, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def streaming_covariance(predictions, @@ -2825,12 +2448,10 @@ def streaming_percentage_less(values, threshold, weights=None, or if either `metrics_collections` or `updates_collections` are not a list or tuple. """ - is_below_threshold = math_ops.to_float(math_ops.less(values, threshold)) - return streaming_mean(is_below_threshold, - weights, - metrics_collections, - updates_collections, - name or 'percentage_below_threshold') + return metrics.percentage_below( + values=values, threshold=threshold, weights=weights, + metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def streaming_mean_iou(predictions, @@ -2881,65 +2502,10 @@ def streaming_mean_iou(predictions, either `metrics_collections` or `updates_collections` are not a list or tuple. """ - with variable_scope.variable_scope( - name, 'mean_iou', (predictions, labels, weights)): - # Check if shape is compatible. - predictions.get_shape().assert_is_compatible_with(labels.get_shape()) - - # Local variable to accumulate the predictions in the confusion matrix. - cm_dtype = dtypes.int64 if weights is not None else dtypes.float64 - total_cm = _create_local('total_confusion_matrix', - shape=[num_classes, num_classes], dtype=cm_dtype) - - # Cast the type to int64 required by confusion_matrix_ops. - predictions = math_ops.to_int64(predictions) - labels = math_ops.to_int64(labels) - num_classes = math_ops.to_int64(num_classes) - - # Flatten the input if its rank > 1. 
- predictions_rank = predictions.get_shape().ndims - if predictions_rank > 1: - predictions = array_ops.reshape(predictions, [-1]) - - labels_rank = labels.get_shape().ndims - if labels_rank > 1: - labels = array_ops.reshape(labels, [-1]) - - if weights is not None: - weights_rank = weights.get_shape().ndims - if weights_rank > 1: - weights = array_ops.reshape(weights, [-1]) - - # Accumulate the prediction to current confusion matrix. - current_cm = confusion_matrix_ops.confusion_matrix( - predictions, labels, num_classes, weights=weights, dtype=cm_dtype) - update_op = state_ops.assign_add(total_cm, current_cm) - - def compute_mean_iou(name): - """Compute the mean intersection-over-union via the confusion matrix.""" - sum_over_row = math_ops.to_float(math_ops.reduce_sum(total_cm, 0)) - sum_over_col = math_ops.to_float(math_ops.reduce_sum(total_cm, 1)) - cm_diag = math_ops.to_float(array_ops.diag_part(total_cm)) - denominator = sum_over_row + sum_over_col - cm_diag - - # If the value of the denominator is 0, set it to 1 to avoid - # zero division. - denominator = array_ops.where( - math_ops.greater(denominator, 0), - denominator, - array_ops.ones_like(denominator)) - iou = math_ops.div(cm_diag, denominator) - return math_ops.reduce_mean(iou, name=name) - - mean_iou = compute_mean_iou('mean_iou') - - if metrics_collections: - ops.add_to_collections(metrics_collections, mean_iou) - - if updates_collections: - ops.add_to_collections(updates_collections, update_op) - - return mean_iou, update_op + return metrics.mean_iou( + num_classes=num_classes, predictions=predictions, labels=labels, + weights=weights, metrics_collections=metrics_collections, + updates_collections=updates_collections, name=name) def _next_array_size(required_size, growth_factor=1.5): diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index a97898d1f5e..0d305352077 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -39,6 +39,7 @@ py_library( ":platform", ":platform_test", ":summary", + ":metrics", ":layers", ":training", ":ops", @@ -1312,6 +1313,39 @@ py_library( ], ) +py_library( + name = "confusion_matrix", + srcs = ["ops/confusion_matrix.py"], + srcs_version = "PY2AND3", + deps = [ + ":array_ops", + ":control_flow_ops", + ":framework", + ":math_ops", + ":sparse_ops", + ], +) + +py_library( + name = "metrics", + srcs = ["ops/metrics.py"], + srcs_version = "PY2AND3", + deps = [ + ":array_ops", + ":check_ops", + ":confusion_matrix", + ":control_flow_ops", + ":framework", + ":math_ops", + ":nn", + ":sets", + ":sparse_ops", + ":state_ops", + ":variable_scope", + ":variables", + ], +) + py_library( name = "special_math_ops", srcs = ["ops/special_math_ops.py"], @@ -1334,6 +1368,7 @@ py_library( ":array_ops", ":check_ops", ":clip_ops", + ":confusion_matrix", ":control_flow_ops", ":data_flow_grad", ":data_flow_ops", diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py index e323c9b6a4d..8f94fb4c9dd 100644 --- a/tensorflow/python/__init__.py +++ b/tensorflow/python/__init__.py @@ -83,6 +83,7 @@ from tensorflow.python.ops.standard_ops import * # Bring in subpackages. 
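The confusion-matrix-based `streaming_mean_iou` shown above now defers to `metrics.mean_iou`, with `num_classes` passed through unchanged. A minimal sketch under the same TF 0.12-era session workflow as the earlier examples (label and prediction values are illustrative):

```python
import tensorflow as tf
from tensorflow.python.ops import metrics

labels = tf.constant([0, 1, 2, 2])
predictions = tf.constant([0, 1, 1, 2])
miou, update_op = metrics.mean_iou(labels=labels, predictions=predictions,
                                   num_classes=3)

with tf.Session() as sess:
  sess.run(tf.local_variables_initializer())
  sess.run(update_op)       # accumulates the confusion matrix
  print(sess.run(miou))     # mean IoU over the 3 classes
```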
from tensorflow.python.layers import layers +from tensorflow.python.ops import metrics from tensorflow.python.ops import nn from tensorflow.python.ops import resources from tensorflow.python.ops import sdca_ops as sdca @@ -118,6 +119,7 @@ from tensorflow.python.framework import constant_op from tensorflow.python.framework import framework_lib from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops +from tensorflow.python.ops import confusion_matrix as confusion_matrix_m from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import functional_ops from tensorflow.python.ops import histogram_ops @@ -220,6 +222,7 @@ _allowed_symbols.extend([ 'image', 'logging', 'losses', + 'metrics', 'newaxis', 'nn', 'python_io', @@ -246,10 +249,10 @@ _allowed_symbols.extend([ # referenced in the whitelist. remove_undocumented(__name__, _allowed_symbols, [framework_lib, array_ops, client_lib, check_ops, - compat, constant_op, control_flow_ops, functional_ops, - histogram_ops, io_ops, losses, math_ops, nn, - resource_loader, resources, sets, script_ops, session_ops, - sparse_ops, state_ops, string_ops, summary, + compat, constant_op, control_flow_ops, confusion_matrix_m, + functional_ops, histogram_ops, io_ops, losses, math_ops, + metrics, nn, resource_loader, resources, sets, script_ops, + session_ops, sparse_ops, state_ops, string_ops, summary, tensor_array_ops, train, layers]) # Special dunders that we choose to export: diff --git a/tensorflow/python/framework/gen_docs_combined.py b/tensorflow/python/framework/gen_docs_combined.py index c2d97b3496d..3ede5403e1f 100644 --- a/tensorflow/python/framework/gen_docs_combined.py +++ b/tensorflow/python/framework/gen_docs_combined.py @@ -260,6 +260,7 @@ EXCLUDE = frozenset(["tf.contrib.learn.monitors.NanLossDuringTrainingError", "tf.contrib.framework.get_global_step", "tf.contrib.learn.NanLossDuringTrainingError", "tf.contrib.layers.stack", + "tf.confusion_matrix", "tf.nn.rnn_cell.RNNCell", "tf.nn.rnn_cell.BasicRNNCell", "tf.nn.rnn_cell.BasicLSTMCell", diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index 3185b1fd064..e825e593250 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -1385,6 +1385,21 @@ tf_py_test( additional_deps = ["//tensorflow:tensorflow_py"], ) +tf_py_test( + name = "metrics_test", + size = "small", + srcs = ["metrics_test.py"], + additional_deps = ["//tensorflow:tensorflow_py"], + shard_count = 3, +) + +tf_py_test( + name = "confusion_matrix_test", + size = "small", + srcs = ["confusion_matrix_test.py"], + additional_deps = ["//tensorflow:tensorflow_py"], +) + filegroup( name = "all_files", srcs = glob( diff --git a/tensorflow/contrib/metrics/python/kernel_tests/confusion_matrix_ops_test.py b/tensorflow/python/kernel_tests/confusion_matrix_test.py similarity index 88% rename from tensorflow/contrib/metrics/python/kernel_tests/confusion_matrix_ops_test.py rename to tensorflow/python/kernel_tests/confusion_matrix_test.py index a81ef6f9a2a..ff1231de42a 100644 --- a/tensorflow/contrib/metrics/python/kernel_tests/confusion_matrix_ops_test.py +++ b/tensorflow/python/kernel_tests/confusion_matrix_test.py @@ -28,8 +28,8 @@ class ConfusionMatrixTest(tf.test.TestCase): def _testConfMatrix(self, predictions, labels, truth, weights=None): with self.test_session(): dtype = predictions.dtype - ans = tf.contrib.metrics.confusion_matrix( - predictions, labels, dtype=dtype, weights=weights) + ans = tf.confusion_matrix( + 
labels, predictions, dtype=dtype, weights=weights) tf_ans = ans.eval() self.assertAllClose(tf_ans, truth, atol=1e-10) self.assertEqual(tf_ans.dtype, dtype) @@ -69,8 +69,8 @@ class ConfusionMatrixTest(tf.test.TestCase): lab = tf.concat(0, [tf.zeros([20], dtype=tf_dtype), tf.ones([20], dtype=tf_dtype)]) - cm = tf.contrib.metrics.confusion_matrix( - data, lab, dtype=tf_dtype, num_classes=2) + cm = tf.confusion_matrix( + lab, data, dtype=tf_dtype, num_classes=2) d, l, cm_out = sess.run([data, lab, cm], {m_neg: 0.0, m_pos: 1.0, @@ -157,28 +157,28 @@ class ConfusionMatrixTest(tf.test.TestCase): predictions = np.asarray([[1, 2, 3]]) labels = np.asarray([1, 2, 3]) self.assertRaisesRegexp(ValueError, "an not squeeze dim", - tf.contrib.metrics.confusion_matrix, predictions, - labels) + tf.confusion_matrix, + predictions, labels) predictions = np.asarray([1, 2, 3]) labels = np.asarray([[1, 2, 3]]) self.assertRaisesRegexp(ValueError, "an not squeeze dim", - tf.contrib.metrics.confusion_matrix, predictions, - labels) + tf.confusion_matrix, + predictions, labels) def testInputDifferentSize(self): predictions = np.asarray([1, 2, 3]) labels = np.asarray([1, 2]) self.assertRaisesRegexp(ValueError, "must be equal", - tf.contrib.metrics.confusion_matrix, predictions, - labels) + tf.confusion_matrix, + predictions, labels) def testOutputIsInt32(self): predictions = np.arange(2) labels = np.arange(2) with self.test_session(): - cm = tf.contrib.metrics.confusion_matrix( - predictions, labels, dtype=dtypes.int32) + cm = tf.confusion_matrix( + labels, predictions, dtype=dtypes.int32) tf_cm = cm.eval() self.assertEqual(tf_cm.dtype, np.int32) @@ -186,8 +186,8 @@ class ConfusionMatrixTest(tf.test.TestCase): predictions = np.arange(2) labels = np.arange(2) with self.test_session(): - cm = tf.contrib.metrics.confusion_matrix( - predictions, labels, dtype=dtypes.int64) + cm = tf.confusion_matrix( + labels, predictions, dtype=dtypes.int64) tf_cm = cm.eval() self.assertEqual(tf_cm.dtype, np.int64) diff --git a/tensorflow/python/kernel_tests/metrics_test.py b/tensorflow/python/kernel_tests/metrics_test.py new file mode 100644 index 00000000000..28b1811805b --- /dev/null +++ b/tensorflow/python/kernel_tests/metrics_test.py @@ -0,0 +1,3360 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for metrics.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +import numpy as np +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow as tf +from tensorflow.python.ops import metrics + +NAN = float('nan') + + +def _enqueue_vector(sess, queue, values, shape=None): + if not shape: + shape = (1, len(values)) + dtype = queue.dtypes[0] + sess.run(queue.enqueue(tf.constant(values, dtype=dtype, shape=shape))) + + +def _binary_2d_label_to_sparse_value(labels): + """Convert dense 2D binary indicator tensor to sparse tensor. + + Only 1 values in `labels` are included in result. + + Args: + labels: Dense 2D binary indicator tensor. + + Returns: + `SparseTensorValue` whose values are indices along the last dimension of + `labels`. + """ + indices = [] + values = [] + batch = 0 + for row in labels: + label = 0 + xi = 0 + for x in row: + if x == 1: + indices.append([batch, xi]) + values.append(label) + xi += 1 + else: + assert x == 0 + label += 1 + batch += 1 + shape = [len(labels), len(labels[0])] + return tf.SparseTensorValue( + np.array(indices, np.int64), + np.array(values, np.int64), + np.array(shape, np.int64)) + + +def _binary_2d_label_to_sparse(labels): + """Convert dense 2D binary indicator tensor to sparse tensor. + + Only 1 values in `labels` are included in result. + + Args: + labels: Dense 2D binary indicator tensor. + + Returns: + `SparseTensor` whose values are indices along the last dimension of + `labels`. + """ + return tf.SparseTensor.from_value(_binary_2d_label_to_sparse_value(labels)) + + +def _binary_3d_label_to_sparse_value(labels): + """Convert dense 3D binary indicator tensor to sparse tensor. + + Only 1 values in `labels` are included in result. + + Args: + labels: Dense 2D binary indicator tensor. + + Returns: + `SparseTensorValue` whose values are indices along the last dimension of + `labels`. + """ + indices = [] + values = [] + for d0, labels_d0 in enumerate(labels): + for d1, labels_d1 in enumerate(labels_d0): + d2 = 0 + for class_id, label in enumerate(labels_d1): + if label == 1: + values.append(class_id) + indices.append([d0, d1, d2]) + d2 += 1 + else: + assert label == 0 + shape = [len(labels), len(labels[0]), len(labels[0][0])] + return tf.SparseTensorValue( + np.array(indices, np.int64), + np.array(values, np.int64), + np.array(shape, np.int64)) + + +def _binary_3d_label_to_sparse(labels): + """Convert dense 3D binary indicator tensor to sparse tensor. + + Only 1 values in `labels` are included in result. + + Args: + labels: Dense 2D binary indicator tensor. + + Returns: + `SparseTensor` whose values are indices along the last dimension of + `labels`. + """ + return tf.SparseTensor.from_value(_binary_3d_label_to_sparse_value(labels)) + + +def _assert_nan(test_case, actual): + test_case.assertTrue(math.isnan(actual), 'Expected NAN, got %s.' 
% actual) + + +def _assert_local_variables(test_case, expected): + test_case.assertEquals( + set(expected), set(v.name for v in tf.local_variables())) + + +class MeanTest(tf.test.TestCase): + + def setUp(self): + tf.reset_default_graph() + + def testVars(self): + metrics.mean(tf.ones([4, 3])) + _assert_local_variables(self, ('mean/count:0', 'mean/total:0')) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + mean, _ = metrics.mean( + tf.ones([4, 3]), + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.mean( + tf.ones([4, 3]), + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testBasic(self): + with self.test_session() as sess: + values_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2)) + _enqueue_vector(sess, values_queue, [0, 1]) + _enqueue_vector(sess, values_queue, [-4.2, 9.1]) + _enqueue_vector(sess, values_queue, [6.5, 0]) + _enqueue_vector(sess, values_queue, [-3.2, 4.0]) + values = values_queue.dequeue() + + mean, update_op = metrics.mean(values) + + sess.run(tf.local_variables_initializer()) + for _ in range(4): + sess.run(update_op) + self.assertAlmostEqual(1.65, sess.run(mean), 5) + + def testUpdateOpsReturnsCurrentValue(self): + with self.test_session() as sess: + values_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2)) + _enqueue_vector(sess, values_queue, [0, 1]) + _enqueue_vector(sess, values_queue, [-4.2, 9.1]) + _enqueue_vector(sess, values_queue, [6.5, 0]) + _enqueue_vector(sess, values_queue, [-3.2, 4.0]) + values = values_queue.dequeue() + + mean, update_op = metrics.mean(values) + + sess.run(tf.local_variables_initializer()) + + self.assertAlmostEqual(0.5, sess.run(update_op), 5) + self.assertAlmostEqual(1.475, sess.run(update_op), 5) + self.assertAlmostEqual(12.4/6.0, sess.run(update_op), 5) + self.assertAlmostEqual(1.65, sess.run(update_op), 5) + + self.assertAlmostEqual(1.65, sess.run(mean), 5) + + def test1dWeightedValues(self): + with self.test_session() as sess: + # Create the queue that populates the values. + values_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2)) + _enqueue_vector(sess, values_queue, [0, 1]) + _enqueue_vector(sess, values_queue, [-4.2, 9.1]) + _enqueue_vector(sess, values_queue, [6.5, 0]) + _enqueue_vector(sess, values_queue, [-3.2, 4.0]) + values = values_queue.dequeue() + + # Create the queue that populates the weighted labels. + weights_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 1)) + _enqueue_vector(sess, weights_queue, [1]) + _enqueue_vector(sess, weights_queue, [0]) + _enqueue_vector(sess, weights_queue, [0]) + _enqueue_vector(sess, weights_queue, [1]) + weights = weights_queue.dequeue() + + mean, update_op = metrics.mean(values, weights) + + tf.local_variables_initializer().run() + for _ in range(4): + update_op.eval() + self.assertAlmostEqual((0 + 1 - 3.2 + 4.0) / 4.0, mean.eval(), 5) + + def test1dWeightedValues_placeholders(self): + with self.test_session() as sess: + # Create the queue that populates the values. + feed_values = ( + (0, 1), + (-4.2, 9.1), + (6.5, 0), + (-3.2, 4.0) + ) + values = tf.placeholder(dtype=tf.float32) + + # Create the queue that populates the weighted labels. 
+ weights_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 1)) + _enqueue_vector(sess, weights_queue, [1]) + _enqueue_vector(sess, weights_queue, [0]) + _enqueue_vector(sess, weights_queue, [0]) + _enqueue_vector(sess, weights_queue, [1]) + weights = weights_queue.dequeue() + + mean, update_op = metrics.mean(values, weights) + + tf.local_variables_initializer().run() + for i in range(4): + update_op.eval(feed_dict={values: feed_values[i]}) + self.assertAlmostEqual((0 + 1 - 3.2 + 4.0) / 4.0, mean.eval(), 5) + + def test2dWeightedValues(self): + with self.test_session() as sess: + # Create the queue that populates the values. + values_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2)) + _enqueue_vector(sess, values_queue, [0, 1]) + _enqueue_vector(sess, values_queue, [-4.2, 9.1]) + _enqueue_vector(sess, values_queue, [6.5, 0]) + _enqueue_vector(sess, values_queue, [-3.2, 4.0]) + values = values_queue.dequeue() + + # Create the queue that populates the weighted labels. + weights_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2)) + _enqueue_vector(sess, weights_queue, [1, 1]) + _enqueue_vector(sess, weights_queue, [1, 0]) + _enqueue_vector(sess, weights_queue, [0, 1]) + _enqueue_vector(sess, weights_queue, [0, 0]) + weights = weights_queue.dequeue() + + mean, update_op = metrics.mean(values, weights) + + tf.local_variables_initializer().run() + for _ in range(4): + update_op.eval() + self.assertAlmostEqual((0 + 1 - 4.2 + 0) / 4.0, mean.eval(), 5) + + def test2dWeightedValues_placeholders(self): + with self.test_session() as sess: + # Create the queue that populates the values. + feed_values = ( + (0, 1), + (-4.2, 9.1), + (6.5, 0), + (-3.2, 4.0) + ) + values = tf.placeholder(dtype=tf.float32) + + # Create the queue that populates the weighted labels. 
+ weights_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2)) + _enqueue_vector(sess, weights_queue, [1, 1]) + _enqueue_vector(sess, weights_queue, [1, 0]) + _enqueue_vector(sess, weights_queue, [0, 1]) + _enqueue_vector(sess, weights_queue, [0, 0]) + weights = weights_queue.dequeue() + + mean, update_op = metrics.mean(values, weights) + + tf.local_variables_initializer().run() + for i in range(4): + update_op.eval(feed_dict={values: feed_values[i]}) + self.assertAlmostEqual((0 + 1 - 4.2 + 0) / 4.0, mean.eval(), 5) + + +class StreamingMeanTensorTest(tf.test.TestCase): + + def setUp(self): + tf.reset_default_graph() + + def testVars(self): + metrics.mean_tensor(tf.ones([4, 3])) + _assert_local_variables(self, ( + 'mean/total_tensor:0', 'mean/count_tensor:0')) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + mean, _ = metrics.mean_tensor( + tf.ones([4, 3]), + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.mean_tensor( + tf.ones([4, 3]), + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testBasic(self): + with self.test_session() as sess: + values_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2)) + _enqueue_vector(sess, values_queue, [0, 1]) + _enqueue_vector(sess, values_queue, [-4.2, 9.1]) + _enqueue_vector(sess, values_queue, [6.5, 0]) + _enqueue_vector(sess, values_queue, [-3.2, 4.0]) + values = values_queue.dequeue() + + mean, update_op = metrics.mean_tensor(values) + + sess.run(tf.local_variables_initializer()) + for _ in range(4): + sess.run(update_op) + self.assertAllClose([[-0.9/4., 3.525]], sess.run(mean)) + + def testMultiDimensional(self): + with self.test_session() as sess: + values_queue = tf.FIFOQueue(2, dtypes=tf.float32, shapes=(2, 2, 2)) + _enqueue_vector(sess, + values_queue, + [[[1, 2], [1, 2]], [[1, 2], [1, 2]]], + shape=(2, 2, 2)) + _enqueue_vector(sess, + values_queue, + [[[1, 2], [1, 2]], [[3, 4], [9, 10]]], + shape=(2, 2, 2)) + values = values_queue.dequeue() + + mean, update_op = metrics.mean_tensor(values) + + sess.run(tf.local_variables_initializer()) + for _ in range(2): + sess.run(update_op) + self.assertAllClose([[[1, 2], [1, 2]], [[2, 3], [5, 6]]], + sess.run(mean)) + + def testUpdateOpsReturnsCurrentValue(self): + with self.test_session() as sess: + values_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2)) + _enqueue_vector(sess, values_queue, [0, 1]) + _enqueue_vector(sess, values_queue, [-4.2, 9.1]) + _enqueue_vector(sess, values_queue, [6.5, 0]) + _enqueue_vector(sess, values_queue, [-3.2, 4.0]) + values = values_queue.dequeue() + + mean, update_op = metrics.mean_tensor(values) + + sess.run(tf.local_variables_initializer()) + + self.assertAllClose([[0, 1]], sess.run(update_op), 5) + self.assertAllClose([[-2.1, 5.05]], sess.run(update_op), 5) + self.assertAllClose([[2.3/3., 10.1/3.]], sess.run(update_op), 5) + self.assertAllClose([[-0.9/4., 3.525]], sess.run(update_op), 5) + + self.assertAllClose([[-0.9/4., 3.525]], sess.run(mean), 5) + + def testWeighted1d(self): + with self.test_session() as sess: + # Create the queue that populates the values. 
+ values_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2)) + _enqueue_vector(sess, values_queue, [0, 1]) + _enqueue_vector(sess, values_queue, [-4.2, 9.1]) + _enqueue_vector(sess, values_queue, [6.5, 0]) + _enqueue_vector(sess, values_queue, [-3.2, 4.0]) + values = values_queue.dequeue() + + # Create the queue that populates the weights. + weights_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 1)) + _enqueue_vector(sess, weights_queue, [[1]]) + _enqueue_vector(sess, weights_queue, [[0]]) + _enqueue_vector(sess, weights_queue, [[1]]) + _enqueue_vector(sess, weights_queue, [[0]]) + weights = weights_queue.dequeue() + + mean, update_op = metrics.mean_tensor(values, weights) + + sess.run(tf.local_variables_initializer()) + for _ in range(4): + sess.run(update_op) + self.assertAllClose([[3.25, 0.5]], sess.run(mean), 5) + + def testWeighted2d_1(self): + with self.test_session() as sess: + # Create the queue that populates the values. + values_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2)) + _enqueue_vector(sess, values_queue, [0, 1]) + _enqueue_vector(sess, values_queue, [-4.2, 9.1]) + _enqueue_vector(sess, values_queue, [6.5, 0]) + _enqueue_vector(sess, values_queue, [-3.2, 4.0]) + values = values_queue.dequeue() + + # Create the queue that populates the weights. + weights_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2)) + _enqueue_vector(sess, weights_queue, [1, 1]) + _enqueue_vector(sess, weights_queue, [1, 0]) + _enqueue_vector(sess, weights_queue, [0, 1]) + _enqueue_vector(sess, weights_queue, [0, 0]) + weights = weights_queue.dequeue() + + mean, update_op = metrics.mean_tensor(values, weights) + + sess.run(tf.local_variables_initializer()) + for _ in range(4): + sess.run(update_op) + self.assertAllClose([[-2.1, 0.5]], sess.run(mean), 5) + + def testWeighted2d_2(self): + with self.test_session() as sess: + # Create the queue that populates the values. + values_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2)) + _enqueue_vector(sess, values_queue, [0, 1]) + _enqueue_vector(sess, values_queue, [-4.2, 9.1]) + _enqueue_vector(sess, values_queue, [6.5, 0]) + _enqueue_vector(sess, values_queue, [-3.2, 4.0]) + values = values_queue.dequeue() + + # Create the queue that populates the weights. 
+ weights_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2)) + _enqueue_vector(sess, weights_queue, [0, 1]) + _enqueue_vector(sess, weights_queue, [0, 0]) + _enqueue_vector(sess, weights_queue, [0, 1]) + _enqueue_vector(sess, weights_queue, [0, 0]) + weights = weights_queue.dequeue() + + mean, update_op = metrics.mean_tensor(values, weights) + + sess.run(tf.local_variables_initializer()) + for _ in range(4): + sess.run(update_op) + self.assertAllClose([[0, 0.5]], sess.run(mean), 5) + + +class AccuracyTest(tf.test.TestCase): + + def setUp(self): + tf.reset_default_graph() + + def testVars(self): + metrics.accuracy( + predictions=tf.ones((10, 1)), labels=tf.ones((10, 1)), + name='my_accuracy') + _assert_local_variables(self, ( + 'my_accuracy/count:0', 'my_accuracy/total:0')) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + mean, _ = metrics.accuracy( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.accuracy( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testPredictionsAndLabelsOfDifferentSizeRaisesValueError(self): + predictions = tf.ones((10, 3)) + labels = tf.ones((10, 4)) + with self.assertRaises(ValueError): + metrics.accuracy(labels, predictions) + + def testPredictionsAndWeightsOfDifferentSizeRaisesValueError(self): + predictions = tf.ones((10, 3)) + labels = tf.ones((10, 3)) + weights = tf.ones((9, 3)) + with self.assertRaises(ValueError): + metrics.accuracy(labels, predictions, weights) + + def testValueTensorIsIdempotent(self): + predictions = tf.random_uniform((10, 3), maxval=3, dtype=tf.int64, seed=1) + labels = tf.random_uniform((10, 3), maxval=3, dtype=tf.int64, seed=1) + accuracy, update_op = metrics.accuracy( + labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + # Run several updates. + for _ in range(10): + sess.run(update_op) + + # Then verify idempotency. + initial_accuracy = accuracy.eval() + for _ in range(10): + self.assertEqual(initial_accuracy, accuracy.eval()) + + def testMultipleUpdates(self): + with self.test_session() as sess: + # Create the queue that populates the predictions. + preds_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 1)) + _enqueue_vector(sess, preds_queue, [0]) + _enqueue_vector(sess, preds_queue, [1]) + _enqueue_vector(sess, preds_queue, [2]) + _enqueue_vector(sess, preds_queue, [1]) + predictions = preds_queue.dequeue() + + # Create the queue that populates the labels. 
+      labels_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 1))
+      _enqueue_vector(sess, labels_queue, [0])
+      _enqueue_vector(sess, labels_queue, [1])
+      _enqueue_vector(sess, labels_queue, [1])
+      _enqueue_vector(sess, labels_queue, [2])
+      labels = labels_queue.dequeue()
+
+      accuracy, update_op = metrics.accuracy(
+          labels, predictions)
+
+      sess.run(tf.local_variables_initializer())
+      for _ in xrange(3):
+        sess.run(update_op)
+      self.assertEqual(0.5, sess.run(update_op))
+      self.assertEqual(0.5, accuracy.eval())
+
+  def testEffectivelyEquivalentSizes(self):
+    predictions = tf.ones((40, 1))
+    labels = tf.ones((40,))
+    with self.test_session() as sess:
+      accuracy, update_op = metrics.accuracy(
+          labels, predictions)
+
+      sess.run(tf.local_variables_initializer())
+      self.assertEqual(1.0, update_op.eval())
+      self.assertEqual(1.0, accuracy.eval())
+
+  def testEffectivelyEquivalentSizesWithStaticShapedWeight(self):
+    predictions = tf.convert_to_tensor([1, 1, 1])  # shape (3,)
+    labels = tf.expand_dims(tf.convert_to_tensor([1, 0, 0]), 1)  # shape (3, 1)
+    weights = tf.expand_dims(tf.convert_to_tensor([100, 1, 1]), 1)  # shape (3, 1)
+
+    with self.test_session() as sess:
+      accuracy, update_op = metrics.accuracy(
+          labels, predictions, weights)
+
+      sess.run(tf.local_variables_initializer())
+      # If the metric does not flatten the weights, accuracy would be
+      # 0.33333334 due to an intended broadcast of the weights. Due to
+      # flattening, it will be higher than .95.
+      self.assertGreater(update_op.eval(), .95)
+      self.assertGreater(accuracy.eval(), .95)
+
+  def testEffectivelyEquivalentSizesWithDynamicallyShapedWeight(self):
+    predictions = tf.convert_to_tensor([1, 1, 1])  # shape (3,)
+    labels = tf.expand_dims(tf.convert_to_tensor([1, 0, 0]), 1)  # shape (3, 1)
+
+    weights = [[100], [1], [1]]  # shape (3, 1)
+    weights_placeholder = tf.placeholder(dtype=tf.int32, name='weights')
+    feed_dict = {weights_placeholder: weights}
+
+    with self.test_session() as sess:
+      accuracy, update_op = metrics.accuracy(
+          labels, predictions, weights_placeholder)
+
+      sess.run(tf.local_variables_initializer())
+      # If the metric does not flatten the weights, accuracy would be
+      # 0.33333334 due to an intended broadcast of the weights. Due to
+      # flattening, it will be higher than .95.
+      self.assertGreater(update_op.eval(feed_dict=feed_dict), .95)
+      self.assertGreater(accuracy.eval(feed_dict=feed_dict), .95)
+
+  def testMultipleUpdatesWithWeightedValues(self):
+    with self.test_session() as sess:
+      # Create the queue that populates the predictions.
+      preds_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 1))
+      _enqueue_vector(sess, preds_queue, [0])
+      _enqueue_vector(sess, preds_queue, [1])
+      _enqueue_vector(sess, preds_queue, [2])
+      _enqueue_vector(sess, preds_queue, [1])
+      predictions = preds_queue.dequeue()
+
+      # Create the queue that populates the labels.
+      labels_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 1))
+      _enqueue_vector(sess, labels_queue, [0])
+      _enqueue_vector(sess, labels_queue, [1])
+      _enqueue_vector(sess, labels_queue, [1])
+      _enqueue_vector(sess, labels_queue, [2])
+      labels = labels_queue.dequeue()
+
+      # Create the queue that populates the weights.
+ weights_queue = tf.FIFOQueue(4, dtypes=tf.int64, shapes=(1, 1)) + _enqueue_vector(sess, weights_queue, [1]) + _enqueue_vector(sess, weights_queue, [1]) + _enqueue_vector(sess, weights_queue, [0]) + _enqueue_vector(sess, weights_queue, [0]) + weights = weights_queue.dequeue() + + accuracy, update_op = metrics.accuracy( + labels, predictions, weights) + + sess.run(tf.local_variables_initializer()) + for _ in xrange(3): + sess.run(update_op) + self.assertEqual(1.0, sess.run(update_op)) + self.assertEqual(1.0, accuracy.eval()) + + +class PrecisionTest(tf.test.TestCase): + + def setUp(self): + np.random.seed(1) + tf.reset_default_graph() + + def testVars(self): + metrics.precision( + predictions=tf.ones((10, 1)), labels=tf.ones((10, 1))) + _assert_local_variables(self, ( + 'precision/false_positives/count:0', + 'precision/true_positives/count:0' + )) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + mean, _ = metrics.precision( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.precision( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testValueTensorIsIdempotent(self): + predictions = tf.random_uniform((10, 3), maxval=1, dtype=tf.int64, seed=1) + labels = tf.random_uniform((10, 3), maxval=1, dtype=tf.int64, seed=1) + precision, update_op = metrics.precision( + labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + # Run several updates. + for _ in range(10): + sess.run(update_op) + + # Then verify idempotency. 
+ initial_precision = precision.eval() + for _ in range(10): + self.assertEqual(initial_precision, precision.eval()) + + def testAllCorrect(self): + inputs = np.random.randint(0, 2, size=(100, 1)) + + predictions = tf.constant(inputs) + labels = tf.constant(inputs) + precision, update_op = metrics.precision( + labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(1, sess.run(update_op)) + self.assertAlmostEqual(1, precision.eval()) + + def testSomeCorrect(self): + predictions = tf.constant([1, 0, 1, 0], shape=(1, 4)) + labels = tf.constant([0, 1, 1, 0], shape=(1, 4)) + precision, update_op = metrics.precision( + labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(0.5, update_op.eval()) + self.assertAlmostEqual(0.5, precision.eval()) + + def testWeighted1d(self): + predictions = tf.constant([[1, 0, 1, 0], [1, 0, 1, 0]]) + labels = tf.constant([[0, 1, 1, 0], [1, 0, 0, 1]]) + precision, update_op = metrics.precision( + labels, predictions, weights=tf.constant([[2], [5]])) + + with self.test_session(): + tf.local_variables_initializer().run() + weighted_tp = 2.0 + 5.0 + weighted_positives = (2.0 + 2.0) + (5.0 + 5.0) + expected_precision = weighted_tp / weighted_positives + self.assertAlmostEqual(expected_precision, update_op.eval()) + self.assertAlmostEqual(expected_precision, precision.eval()) + + def testWeighted1d_placeholders(self): + predictions = tf.placeholder(dtype=tf.float32) + labels = tf.placeholder(dtype=tf.float32) + feed_dict = { + predictions: ((1, 0, 1, 0), (1, 0, 1, 0)), + labels: ((0, 1, 1, 0), (1, 0, 0, 1)) + } + precision, update_op = metrics.precision( + labels, predictions, weights=tf.constant([[2], [5]])) + + with self.test_session(): + tf.local_variables_initializer().run() + weighted_tp = 2.0 + 5.0 + weighted_positives = (2.0 + 2.0) + (5.0 + 5.0) + expected_precision = weighted_tp / weighted_positives + self.assertAlmostEqual( + expected_precision, update_op.eval(feed_dict=feed_dict)) + self.assertAlmostEqual( + expected_precision, precision.eval(feed_dict=feed_dict)) + + def testWeighted2d(self): + predictions = tf.constant([[1, 0, 1, 0], [1, 0, 1, 0]]) + labels = tf.constant([[0, 1, 1, 0], [1, 0, 0, 1]]) + precision, update_op = metrics.precision( + labels, predictions, weights=tf.constant([[1, 2, 3, 4], [4, 3, 2, 1]])) + + with self.test_session(): + tf.local_variables_initializer().run() + weighted_tp = 3.0 + 4.0 + weighted_positives = (1.0 + 3.0) + (4.0 + 2.0) + expected_precision = weighted_tp / weighted_positives + self.assertAlmostEqual(expected_precision, update_op.eval()) + self.assertAlmostEqual(expected_precision, precision.eval()) + + def testWeighted2d_placeholders(self): + predictions = tf.placeholder(dtype=tf.float32) + labels = tf.placeholder(dtype=tf.float32) + feed_dict = { + predictions: ((1, 0, 1, 0), (1, 0, 1, 0)), + labels: ((0, 1, 1, 0), (1, 0, 0, 1)) + } + precision, update_op = metrics.precision( + labels, predictions, weights=tf.constant([[1, 2, 3, 4], [4, 3, 2, 1]])) + + with self.test_session(): + tf.local_variables_initializer().run() + weighted_tp = 3.0 + 4.0 + weighted_positives = (1.0 + 3.0) + (4.0 + 2.0) + expected_precision = weighted_tp / weighted_positives + self.assertAlmostEqual( + expected_precision, update_op.eval(feed_dict=feed_dict)) + self.assertAlmostEqual( + expected_precision, precision.eval(feed_dict=feed_dict)) + + def testAllIncorrect(self): + inputs = 
np.random.randint(0, 2, size=(100, 1)) + + predictions = tf.constant(inputs) + labels = tf.constant(1 - inputs) + precision, update_op = metrics.precision( + labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + sess.run(update_op) + self.assertAlmostEqual(0, precision.eval()) + + def testZeroTrueAndFalsePositivesGivesZeroPrecision(self): + predictions = tf.constant([0, 0, 0, 0]) + labels = tf.constant([0, 0, 0, 0]) + precision, update_op = metrics.precision( + labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + sess.run(update_op) + self.assertEqual(0.0, precision.eval()) + + +class StreamingRecallTest(tf.test.TestCase): + + def setUp(self): + np.random.seed(1) + tf.reset_default_graph() + + def testVars(self): + metrics.recall( + predictions=tf.ones((10, 1)), labels=tf.ones((10, 1))) + _assert_local_variables(self, ( + 'recall/false_negatives/count:0', + 'recall/true_positives/count:0' + )) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + mean, _ = metrics.recall( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.recall( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testValueTensorIsIdempotent(self): + predictions = tf.random_uniform((10, 3), maxval=1, dtype=tf.int64, seed=1) + labels = tf.random_uniform((10, 3), maxval=1, dtype=tf.int64, seed=1) + recall, update_op = metrics.recall( + labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + # Run several updates. + for _ in range(10): + sess.run(update_op) + + # Then verify idempotency. 
+ initial_recall = recall.eval() + for _ in range(10): + self.assertEqual(initial_recall, recall.eval()) + + def testAllCorrect(self): + np_inputs = np.random.randint(0, 2, size=(100, 1)) + + predictions = tf.constant(np_inputs) + labels = tf.constant(np_inputs) + recall, update_op = metrics.recall(labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + sess.run(update_op) + self.assertEqual(1, recall.eval()) + + def testSomeCorrect(self): + predictions = tf.constant([1, 0, 1, 0], shape=(1, 4)) + labels = tf.constant([0, 1, 1, 0], shape=(1, 4)) + recall, update_op = metrics.recall(labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(0.5, update_op.eval()) + self.assertAlmostEqual(0.5, recall.eval()) + + def testWeighted1d(self): + predictions = tf.constant([[1, 0, 1, 0], [0, 1, 0, 1]]) + labels = tf.constant([[0, 1, 1, 0], [1, 0, 0, 1]]) + weights = tf.constant([[2], [5]]) + recall, update_op = metrics.recall( + labels, predictions, weights=weights) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + weighted_tp = 2.0 + 5.0 + weighted_t = (2.0 + 2.0) + (5.0 + 5.0) + expected_precision = weighted_tp / weighted_t + self.assertAlmostEqual(expected_precision, update_op.eval()) + self.assertAlmostEqual(expected_precision, recall.eval()) + + def testWeighted2d(self): + predictions = tf.constant([[1, 0, 1, 0], [0, 1, 0, 1]]) + labels = tf.constant([[0, 1, 1, 0], [1, 0, 0, 1]]) + weights = tf.constant([[1, 2, 3, 4], [4, 3, 2, 1]]) + recall, update_op = metrics.recall( + labels, predictions, weights=weights) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + weighted_tp = 3.0 + 1.0 + weighted_t = (2.0 + 3.0) + (4.0 + 1.0) + expected_precision = weighted_tp / weighted_t + self.assertAlmostEqual(expected_precision, update_op.eval()) + self.assertAlmostEqual(expected_precision, recall.eval()) + + def testAllIncorrect(self): + np_inputs = np.random.randint(0, 2, size=(100, 1)) + + predictions = tf.constant(np_inputs) + labels = tf.constant(1 - np_inputs) + recall, update_op = metrics.recall(labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + sess.run(update_op) + self.assertEqual(0, recall.eval()) + + def testZeroTruePositivesAndFalseNegativesGivesZeroRecall(self): + predictions = tf.zeros((1, 4)) + labels = tf.zeros((1, 4)) + recall, update_op = metrics.recall(labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + sess.run(update_op) + self.assertEqual(0, recall.eval()) + + +class StreamingAUCTest(tf.test.TestCase): + + def setUp(self): + np.random.seed(1) + tf.reset_default_graph() + + def testVars(self): + metrics.auc( + predictions=tf.ones((10, 1)), labels=tf.ones((10, 1))) + _assert_local_variables(self, ( + 'auc/true_positives:0', + 'auc/false_negatives:0', + 'auc/false_positives:0', + 'auc/true_negatives:0' + )) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + mean, _ = metrics.auc( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.auc( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + updates_collections=[my_collection_name]) + 
self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testValueTensorIsIdempotent(self): + predictions = tf.random_uniform((10, 3), maxval=1, dtype=tf.float32, seed=1) + labels = tf.random_uniform((10, 3), maxval=1, dtype=tf.int64, seed=1) + auc, update_op = metrics.auc( + labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + # Run several updates. + for _ in range(10): + sess.run(update_op) + + # Then verify idempotency. + initial_auc = auc.eval() + for _ in range(10): + self.assertAlmostEqual(initial_auc, auc.eval(), 5) + + def testAllCorrect(self): + self.allCorrectAsExpected('ROC') + + def allCorrectAsExpected(self, curve): + inputs = np.random.randint(0, 2, size=(100, 1)) + + with self.test_session() as sess: + predictions = tf.constant(inputs, dtype=tf.float32) + labels = tf.constant(inputs) + auc, update_op = metrics.auc(labels, predictions, curve=curve) + + sess.run(tf.local_variables_initializer()) + self.assertEqual(1, sess.run(update_op)) + + self.assertEqual(1, auc.eval()) + + def testSomeCorrect(self): + with self.test_session() as sess: + predictions = tf.constant([1, 0, 1, 0], shape=(1, 4), dtype=tf.float32) + labels = tf.constant([0, 1, 1, 0], shape=(1, 4)) + auc, update_op = metrics.auc(labels, predictions) + + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(0.5, sess.run(update_op)) + + self.assertAlmostEqual(0.5, auc.eval()) + + def testWeighted1d(self): + with self.test_session() as sess: + predictions = tf.constant([1, 0, 1, 0], shape=(1, 4), dtype=tf.float32) + labels = tf.constant([0, 1, 1, 0], shape=(1, 4)) + weights = tf.constant([2], shape=(1, 1)) + auc, update_op = metrics.auc(labels, + predictions, weights=weights) + + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(0.5, sess.run(update_op), 5) + + self.assertAlmostEqual(0.5, auc.eval(), 5) + + def testWeighted2d(self): + with self.test_session() as sess: + predictions = tf.constant([1, 0, 1, 0], shape=(1, 4), dtype=tf.float32) + labels = tf.constant([0, 1, 1, 0], shape=(1, 4)) + weights = tf.constant([1, 2, 3, 4], shape=(1, 4)) + auc, update_op = metrics.auc(labels, + predictions, weights=weights) + + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(0.7, sess.run(update_op), 5) + + self.assertAlmostEqual(0.7, auc.eval(), 5) + + def testAUCPRSpecialCase(self): + with self.test_session() as sess: + predictions = tf.constant([0.1, 0.4, 0.35, 0.8], + shape=(1, 4), dtype=tf.float32) + labels = tf.constant([0, 0, 1, 1], shape=(1, 4)) + auc, update_op = metrics.auc(labels, predictions, curve='PR') + + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(0.79166, sess.run(update_op), delta=1e-3) + + self.assertAlmostEqual(0.79166, auc.eval(), delta=1e-3) + + def testAnotherAUCPRSpecialCase(self): + with self.test_session() as sess: + predictions = tf.constant([0.1, 0.4, 0.35, 0.8, 0.1, 0.135, 0.81], + shape=(1, 7), dtype=tf.float32) + labels = tf.constant([0, 0, 1, 0, 1, 0, 1], shape=(1, 7)) + auc, update_op = metrics.auc(labels, predictions, curve='PR') + + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(0.610317, sess.run(update_op), delta=1e-3) + + self.assertAlmostEqual(0.610317, auc.eval(), delta=1e-3) + + def testThirdAUCPRSpecialCase(self): + with self.test_session() as sess: + predictions = tf.constant([0.0, 0.1, 0.2, 0.33, 0.3, 0.4, 0.5], + shape=(1, 7), dtype=tf.float32) + labels = tf.constant([0, 0, 0, 0, 1, 1, 1], shape=(1, 7)) + auc, 
update_op = metrics.auc(labels, predictions, curve='PR')
+
+      sess.run(tf.local_variables_initializer())
+      self.assertAlmostEqual(0.90277, sess.run(update_op), delta=1e-3)
+
+      self.assertAlmostEqual(0.90277, auc.eval(), delta=1e-3)
+
+  def testAllIncorrect(self):
+    inputs = np.random.randint(0, 2, size=(100, 1))
+
+    with self.test_session() as sess:
+      predictions = tf.constant(inputs, dtype=tf.float32)
+      labels = tf.constant(1 - inputs, dtype=tf.float32)
+      auc, update_op = metrics.auc(labels, predictions)
+
+      sess.run(tf.local_variables_initializer())
+      self.assertAlmostEqual(0, sess.run(update_op))
+
+      self.assertAlmostEqual(0, auc.eval())
+
+  def testZeroTruePositivesAndFalseNegativesGivesOneAUC(self):
+    with self.test_session() as sess:
+      predictions = tf.zeros([4], dtype=tf.float32)
+      labels = tf.zeros([4])
+      auc, update_op = metrics.auc(labels, predictions)
+
+      sess.run(tf.local_variables_initializer())
+      self.assertAlmostEqual(1, sess.run(update_op), 6)
+
+      self.assertAlmostEqual(1, auc.eval(), 6)
+
+  def testRecallOneAndPrecisionOneGivesOnePRAUC(self):
+    with self.test_session() as sess:
+      predictions = tf.ones([4], dtype=tf.float32)
+      labels = tf.ones([4])
+      auc, update_op = metrics.auc(labels,
+                                   predictions,
+                                   curve='PR')
+
+      sess.run(tf.local_variables_initializer())
+      self.assertAlmostEqual(1, sess.run(update_op), 6)
+
+      self.assertAlmostEqual(1, auc.eval(), 6)
+
+  def np_auc(self, predictions, labels, weights):
+    """Computes the AUC explicitly using NumPy.
+
+    Args:
+      predictions: an ndarray with shape [N].
+      labels: an ndarray with shape [N].
+      weights: an ndarray with shape [N].
+
+    Returns:
+      The area under the ROC curve.
+    """
+    if weights is None:
+      weights = np.ones(np.size(predictions))
+    is_positive = labels > 0
+    num_positives = np.sum(weights[is_positive])
+    num_negatives = np.sum(weights[~is_positive])
+
+    # Sort descending:
+    inds = np.argsort(-predictions)
+
+    sorted_labels = labels[inds]
+    sorted_weights = weights[inds]
+    is_positive = sorted_labels > 0
+
+    tp = np.cumsum(sorted_weights * is_positive) / num_positives
+    return np.sum((sorted_weights * tp)[~is_positive]) / num_negatives
+
+  def testWithMultipleUpdates(self):
+    num_samples = 1000
+    batch_size = 10
+    num_batches = int(num_samples / batch_size)
+
+    # Create the labels and data.
+ labels = np.random.randint(0, 2, size=num_samples) + noise = np.random.normal(0.0, scale=0.2, size=num_samples) + predictions = 0.4 + 0.2 * labels + noise + predictions[predictions > 1] = 1 + predictions[predictions < 0] = 0 + + def _enqueue_as_batches(x, enqueue_ops): + x_batches = x.astype(np.float32).reshape((num_batches, batch_size)) + x_queue = tf.FIFOQueue(num_batches, dtypes=tf.float32, + shapes=(batch_size,)) + for i in range(num_batches): + enqueue_ops[i].append(x_queue.enqueue(x_batches[i, :])) + return x_queue.dequeue() + + for weights in (None, + np.ones(num_samples), + np.random.exponential(scale=1.0, size=num_samples)): + expected_auc = self.np_auc(predictions, labels, weights) + + with self.test_session() as sess: + enqueue_ops = [[] for i in range(num_batches)] + tf_predictions = _enqueue_as_batches(predictions, enqueue_ops) + tf_labels = _enqueue_as_batches(labels, enqueue_ops) + tf_weights = (_enqueue_as_batches(weights, enqueue_ops) + if weights is not None else None) + + for i in range(num_batches): + sess.run(enqueue_ops[i]) + + auc, update_op = metrics.auc( + tf_labels, tf_predictions, curve='ROC', num_thresholds=500, + weights=tf_weights) + + sess.run(tf.local_variables_initializer()) + for i in range(num_batches): + sess.run(update_op) + + # Since this is only approximate, we can't expect a 6 digits match. + # Although with higher number of samples/thresholds we should see the + # accuracy improving + self.assertAlmostEqual(expected_auc, auc.eval(), 2) + + +class SpecificityAtSensitivityTest(tf.test.TestCase): + + def setUp(self): + np.random.seed(1) + tf.reset_default_graph() + + def testVars(self): + metrics.specificity_at_sensitivity( + predictions=tf.ones((10, 1)), labels=tf.ones((10, 1)), sensitivity=0.7) + _assert_local_variables(self, ( + 'specificity_at_sensitivity/true_positives:0', + 'specificity_at_sensitivity/false_negatives:0', + 'specificity_at_sensitivity/false_positives:0', + 'specificity_at_sensitivity/true_negatives:0' + )) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + mean, _ = metrics.specificity_at_sensitivity( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + sensitivity=0.7, + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.specificity_at_sensitivity( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + sensitivity=0.7, + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testValueTensorIsIdempotent(self): + predictions = tf.random_uniform((10, 3), maxval=1, dtype=tf.float32, seed=1) + labels = tf.random_uniform((10, 3), maxval=2, dtype=tf.int64, seed=1) + specificity, update_op = metrics.specificity_at_sensitivity( + labels, predictions, sensitivity=0.7) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + # Run several updates. + for _ in range(10): + sess.run(update_op) + + # Then verify idempotency. 
+ initial_specificity = specificity.eval() + for _ in range(10): + self.assertAlmostEqual(initial_specificity, specificity.eval(), 5) + + def testAllCorrect(self): + inputs = np.random.randint(0, 2, size=(100, 1)) + + predictions = tf.constant(inputs, dtype=tf.float32) + labels = tf.constant(inputs) + specificity, update_op = metrics.specificity_at_sensitivity( + labels, predictions, sensitivity=0.7) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertEqual(1, sess.run(update_op)) + self.assertEqual(1, specificity.eval()) + + def testSomeCorrectHighSensitivity(self): + predictions_values = [0.1, 0.2, 0.4, 0.3, 0.0, + 0.1, 0.45, 0.5, 0.8, 0.9] + labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + + predictions = tf.constant(predictions_values, dtype=tf.float32) + labels = tf.constant(labels_values) + specificity, update_op = metrics.specificity_at_sensitivity( + labels, predictions, sensitivity=0.8) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(1.0, sess.run(update_op)) + self.assertAlmostEqual(1.0, specificity.eval()) + + def testSomeCorrectLowSensitivity(self): + predictions_values = [0.1, 0.2, 0.4, 0.3, 0.0, + 0.1, 0.2, 0.2, 0.26, 0.26] + labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + + predictions = tf.constant(predictions_values, dtype=tf.float32) + labels = tf.constant(labels_values) + specificity, update_op = metrics.specificity_at_sensitivity( + labels, predictions, sensitivity=0.4) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + self.assertAlmostEqual(0.6, sess.run(update_op)) + self.assertAlmostEqual(0.6, specificity.eval()) + + def testWeighted1d(self): + predictions_values = [0.1, 0.2, 0.4, 0.3, 0.0, + 0.1, 0.2, 0.2, 0.26, 0.26] + labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + weights_values = [3] + + predictions = tf.constant(predictions_values, dtype=tf.float32) + labels = tf.constant(labels_values) + weights = tf.constant(weights_values) + specificity, update_op = metrics.specificity_at_sensitivity( + labels, predictions, weights=weights, sensitivity=0.4) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + self.assertAlmostEqual(0.6, sess.run(update_op)) + self.assertAlmostEqual(0.6, specificity.eval()) + + def testWeighted2d(self): + predictions_values = [0.1, 0.2, 0.4, 0.3, 0.0, + 0.1, 0.2, 0.2, 0.26, 0.26] + labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + weights_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + + predictions = tf.constant(predictions_values, dtype=tf.float32) + labels = tf.constant(labels_values) + weights = tf.constant(weights_values) + specificity, update_op = metrics.specificity_at_sensitivity( + labels, predictions, weights=weights, sensitivity=0.4) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + self.assertAlmostEqual(8.0 / 15.0, sess.run(update_op)) + self.assertAlmostEqual(8.0 / 15.0, specificity.eval()) + + +class StreamingSensitivityAtSpecificityTest(tf.test.TestCase): + + def setUp(self): + np.random.seed(1) + tf.reset_default_graph() + + def testVars(self): + metrics.sensitivity_at_specificity( + predictions=tf.ones((10, 1)), labels=tf.ones((10, 1)), specificity=0.7) + _assert_local_variables(self, ( + 'sensitivity_at_specificity/true_positives:0', + 'sensitivity_at_specificity/false_negatives:0', + 'sensitivity_at_specificity/false_positives:0', + 'sensitivity_at_specificity/true_negatives:0' + )) + + def testMetricsCollection(self): 
+ my_collection_name = '__metrics__' + mean, _ = metrics.sensitivity_at_specificity( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + specificity=0.7, + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.sensitivity_at_specificity( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + specificity=0.7, + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testValueTensorIsIdempotent(self): + predictions = tf.random_uniform((10, 3), maxval=1, dtype=tf.float32, seed=1) + labels = tf.random_uniform((10, 3), maxval=2, dtype=tf.int64, seed=1) + sensitivity, update_op = metrics.sensitivity_at_specificity( + labels, predictions, specificity=0.7) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + # Run several updates. + for _ in range(10): + sess.run(update_op) + + # Then verify idempotency. + initial_sensitivity = sensitivity.eval() + for _ in range(10): + self.assertAlmostEqual(initial_sensitivity, sensitivity.eval(), 5) + + def testAllCorrect(self): + inputs = np.random.randint(0, 2, size=(100, 1)) + + predictions = tf.constant(inputs, dtype=tf.float32) + labels = tf.constant(inputs) + specificity, update_op = metrics.sensitivity_at_specificity( + labels, predictions, specificity=0.7) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertEqual(1, sess.run(update_op)) + self.assertEqual(1, specificity.eval()) + + def testSomeCorrectHighSpecificity(self): + predictions_values = [0.0, 0.1, 0.2, 0.3, 0.4, + 0.1, 0.45, 0.5, 0.8, 0.9] + labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + + predictions = tf.constant(predictions_values, dtype=tf.float32) + labels = tf.constant(labels_values) + specificity, update_op = metrics.sensitivity_at_specificity( + labels, predictions, specificity=0.8) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(0.8, sess.run(update_op)) + self.assertAlmostEqual(0.8, specificity.eval()) + + def testSomeCorrectLowSpecificity(self): + predictions_values = [0.0, 0.1, 0.2, 0.3, 0.4, + 0.01, 0.02, 0.25, 0.26, 0.26] + labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + + predictions = tf.constant(predictions_values, dtype=tf.float32) + labels = tf.constant(labels_values) + specificity, update_op = metrics.sensitivity_at_specificity( + labels, predictions, specificity=0.4) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(0.6, sess.run(update_op)) + self.assertAlmostEqual(0.6, specificity.eval()) + + def testWeighted(self): + predictions_values = [0.0, 0.1, 0.2, 0.3, 0.4, + 0.01, 0.02, 0.25, 0.26, 0.26] + labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + weights_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + + predictions = tf.constant(predictions_values, dtype=tf.float32) + labels = tf.constant(labels_values) + weights = tf.constant(weights_values) + specificity, update_op = metrics.sensitivity_at_specificity( + labels, predictions, weights=weights, specificity=0.4) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(0.675, sess.run(update_op)) + self.assertAlmostEqual(0.675, specificity.eval()) + + +# TODO(nsilberman): Break this up into two sets of tests. 
+class StreamingPrecisionRecallThresholdsTest(tf.test.TestCase): + + def setUp(self): + np.random.seed(1) + tf.reset_default_graph() + + def testVars(self): + metrics.precision_at_thresholds( + predictions=tf.ones((10, 1)), labels=tf.ones((10, 1)), + thresholds=[0, 0.5, 1.0]) + _assert_local_variables(self, ( + 'precision_at_thresholds/true_positives:0', + 'precision_at_thresholds/false_positives:0', + )) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + prec, _ = metrics.precision_at_thresholds( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + thresholds=[0, 0.5, 1.0], + metrics_collections=[my_collection_name]) + rec, _ = metrics.recall_at_thresholds( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + thresholds=[0, 0.5, 1.0], + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [prec, rec]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, precision_op = metrics.precision_at_thresholds( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + thresholds=[0, 0.5, 1.0], + updates_collections=[my_collection_name]) + _, recall_op = metrics.recall_at_thresholds( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + thresholds=[0, 0.5, 1.0], + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), + [precision_op, recall_op]) + + def testValueTensorIsIdempotent(self): + predictions = tf.random_uniform((10, 3), maxval=1, dtype=tf.float32, seed=1) + labels = tf.random_uniform((10, 3), maxval=1, dtype=tf.int64, seed=1) + thresholds = [0, 0.5, 1.0] + prec, prec_op = metrics.precision_at_thresholds( + labels, predictions, thresholds) + rec, rec_op = metrics.recall_at_thresholds( + labels, predictions, thresholds) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + # Run several updates, then verify idempotency. + sess.run([prec_op, rec_op]) + initial_prec = prec.eval() + initial_rec = rec.eval() + for _ in range(10): + sess.run([prec_op, rec_op]) + self.assertAllClose(initial_prec, prec.eval()) + self.assertAllClose(initial_rec, rec.eval()) + + # TODO(nsilberman): fix tests (passing but incorrect). 
+ def testAllCorrect(self): + inputs = np.random.randint(0, 2, size=(100, 1)) + + with self.test_session() as sess: + predictions = tf.constant(inputs, dtype=tf.float32) + labels = tf.constant(inputs) + thresholds = [0.5] + prec, prec_op = metrics.precision_at_thresholds( + labels, predictions, thresholds) + rec, rec_op = metrics.recall_at_thresholds( + labels, predictions, thresholds) + + sess.run(tf.local_variables_initializer()) + sess.run([prec_op, rec_op]) + + self.assertEqual(1, prec.eval()) + self.assertEqual(1, rec.eval()) + + def testSomeCorrect(self): + with self.test_session() as sess: + predictions = tf.constant([1, 0, 1, 0], shape=(1, 4), dtype=tf.float32) + labels = tf.constant([0, 1, 1, 0], shape=(1, 4)) + thresholds = [0.5] + prec, prec_op = metrics.precision_at_thresholds( + labels, predictions, thresholds) + rec, rec_op = metrics.recall_at_thresholds( + labels, predictions, thresholds) + + sess.run(tf.local_variables_initializer()) + sess.run([prec_op, rec_op]) + + self.assertAlmostEqual(0.5, prec.eval()) + self.assertAlmostEqual(0.5, rec.eval()) + + def testAllIncorrect(self): + inputs = np.random.randint(0, 2, size=(100, 1)) + + with self.test_session() as sess: + predictions = tf.constant(inputs, dtype=tf.float32) + labels = tf.constant(1 - inputs, dtype=tf.float32) + thresholds = [0.5] + prec, prec_op = metrics.precision_at_thresholds( + labels, predictions, thresholds) + rec, rec_op = metrics.recall_at_thresholds( + labels, predictions, thresholds) + + sess.run(tf.local_variables_initializer()) + sess.run([prec_op, rec_op]) + + self.assertAlmostEqual(0, prec.eval()) + self.assertAlmostEqual(0, rec.eval()) + + def testWeights1d(self): + with self.test_session() as sess: + predictions = tf.constant([[1, 0], [1, 0]], shape=(2, 2), + dtype=tf.float32) + labels = tf.constant([[0, 1], [1, 0]], shape=(2, 2)) + weights = tf.constant([[0], [1]], shape=(2, 1), dtype=tf.float32) + thresholds = [0.5, 1.1] + prec, prec_op = metrics.precision_at_thresholds( + labels, predictions, thresholds, weights=weights) + rec, rec_op = metrics.recall_at_thresholds( + labels, predictions, thresholds, weights=weights) + + [prec_low, prec_high] = tf.split(0, 2, prec) + prec_low = tf.reshape(prec_low, shape=()) + prec_high = tf.reshape(prec_high, shape=()) + [rec_low, rec_high] = tf.split(0, 2, rec) + rec_low = tf.reshape(rec_low, shape=()) + rec_high = tf.reshape(rec_high, shape=()) + + sess.run(tf.local_variables_initializer()) + sess.run([prec_op, rec_op]) + + self.assertAlmostEqual(1.0, prec_low.eval(), places=5) + self.assertAlmostEqual(0.0, prec_high.eval(), places=5) + self.assertAlmostEqual(1.0, rec_low.eval(), places=5) + self.assertAlmostEqual(0.0, rec_high.eval(), places=5) + + def testWeights2d(self): + with self.test_session() as sess: + predictions = tf.constant([[1, 0], [1, 0]], shape=(2, 2), + dtype=tf.float32) + labels = tf.constant([[0, 1], [1, 0]], shape=(2, 2)) + weights = tf.constant([[0, 0], [1, 1]], shape=(2, 2), dtype=tf.float32) + thresholds = [0.5, 1.1] + prec, prec_op = metrics.precision_at_thresholds( + labels, predictions, thresholds, weights=weights) + rec, rec_op = metrics.recall_at_thresholds( + labels, predictions, thresholds, weights=weights) + + [prec_low, prec_high] = tf.split(0, 2, prec) + prec_low = tf.reshape(prec_low, shape=()) + prec_high = tf.reshape(prec_high, shape=()) + [rec_low, rec_high] = tf.split(0, 2, rec) + rec_low = tf.reshape(rec_low, shape=()) + rec_high = tf.reshape(rec_high, shape=()) + + sess.run(tf.local_variables_initializer()) + 
sess.run([prec_op, rec_op]) + + self.assertAlmostEqual(1.0, prec_low.eval(), places=5) + self.assertAlmostEqual(0.0, prec_high.eval(), places=5) + self.assertAlmostEqual(1.0, rec_low.eval(), places=5) + self.assertAlmostEqual(0.0, rec_high.eval(), places=5) + + def testExtremeThresholds(self): + with self.test_session() as sess: + predictions = tf.constant([1, 0, 1, 0], shape=(1, 4), dtype=tf.float32) + labels = tf.constant([0, 1, 1, 1], shape=(1, 4)) + thresholds = [-1.0, 2.0] # lower/higher than any values + prec, prec_op = metrics.precision_at_thresholds( + labels, predictions, thresholds) + rec, rec_op = metrics.recall_at_thresholds( + labels, predictions, thresholds) + + [prec_low, prec_high] = tf.split(0, 2, prec) + [rec_low, rec_high] = tf.split(0, 2, rec) + + sess.run(tf.local_variables_initializer()) + sess.run([prec_op, rec_op]) + + self.assertAlmostEqual(0.75, prec_low.eval()) + self.assertAlmostEqual(0.0, prec_high.eval()) + self.assertAlmostEqual(1.0, rec_low.eval()) + self.assertAlmostEqual(0.0, rec_high.eval()) + + def testZeroLabelsPredictions(self): + with self.test_session() as sess: + predictions = tf.zeros([4], dtype=tf.float32) + labels = tf.zeros([4]) + thresholds = [0.5] + prec, prec_op = metrics.precision_at_thresholds( + labels, predictions, thresholds) + rec, rec_op = metrics.recall_at_thresholds( + labels, predictions, thresholds) + + sess.run(tf.local_variables_initializer()) + sess.run([prec_op, rec_op]) + + self.assertAlmostEqual(0, prec.eval(), 6) + self.assertAlmostEqual(0, rec.eval(), 6) + + def testWithMultipleUpdates(self): + num_samples = 1000 + batch_size = 10 + num_batches = int(num_samples / batch_size) + + # Create the labels and data. + labels = np.random.randint(0, 2, size=(num_samples, 1)) + noise = np.random.normal(0.0, scale=0.2, size=(num_samples, 1)) + predictions = 0.4 + 0.2 * labels + noise + predictions[predictions > 1] = 1 + predictions[predictions < 0] = 0 + thresholds = [0.3] + + tp = 0 + fp = 0 + fn = 0 + tn = 0 + for i in range(num_samples): + if predictions[i] > thresholds[0]: + if labels[i] == 1: + tp += 1 + else: + fp += 1 + else: + if labels[i] == 1: + fn += 1 + else: + tn += 1 + epsilon = 1e-7 + expected_prec = tp / (epsilon + tp + fp) + expected_rec = tp / (epsilon + tp + fn) + + labels = labels.astype(np.float32) + predictions = predictions.astype(np.float32) + + with self.test_session() as sess: + # Reshape the data so its easy to queue up: + predictions_batches = predictions.reshape((batch_size, num_batches)) + labels_batches = labels.reshape((batch_size, num_batches)) + + # Enqueue the data: + predictions_queue = tf.FIFOQueue(num_batches, dtypes=tf.float32, + shapes=(batch_size,)) + labels_queue = tf.FIFOQueue(num_batches, dtypes=tf.float32, + shapes=(batch_size,)) + + for i in range(int(num_batches)): + tf_prediction = tf.constant(predictions_batches[:, i]) + tf_label = tf.constant(labels_batches[:, i]) + sess.run([predictions_queue.enqueue(tf_prediction), + labels_queue.enqueue(tf_label)]) + + tf_predictions = predictions_queue.dequeue() + tf_labels = labels_queue.dequeue() + + prec, prec_op = metrics.precision_at_thresholds( + tf_labels, tf_predictions, thresholds) + rec, rec_op = metrics.recall_at_thresholds( + tf_labels, tf_predictions, thresholds) + + sess.run(tf.local_variables_initializer()) + for _ in range(int(num_samples / batch_size)): + sess.run([prec_op, rec_op]) + # Since this is only approximate, we can't expect a 6 digits match. 
+ # Although with higher number of samples/thresholds we should see the + # accuracy improving + self.assertAlmostEqual(expected_prec, prec.eval(), 2) + self.assertAlmostEqual(expected_rec, rec.eval(), 2) + + +class StreamingSparsePrecisionTest(tf.test.TestCase): + + def _test_streaming_sparse_precision_at_k(self, + predictions, + labels, + k, + expected, + class_id=None, + weights=None): + with tf.Graph().as_default() as g, self.test_session(g): + if weights is not None: + weights = tf.constant(weights, tf.float32) + metric, update = metrics.sparse_precision_at_k( + predictions=tf.constant(predictions, tf.float32), labels=labels, + k=k, class_id=class_id, weights=weights) + + # Fails without initialized vars. + self.assertRaises(tf.OpError, metric.eval) + self.assertRaises(tf.OpError, update.eval) + tf.initialize_variables(tf.local_variables()).run() + + # Run per-step op and assert expected values. + if math.isnan(expected): + _assert_nan(self, update.eval()) + _assert_nan(self, metric.eval()) + else: + self.assertEqual(expected, update.eval()) + self.assertEqual(expected, metric.eval()) + + def _test_streaming_sparse_average_precision_at_k( + self, predictions, labels, k, expected, weights=None): + with tf.Graph().as_default() as g, self.test_session(g): + if weights is not None: + weights = tf.constant(weights, tf.float32) + predictions = tf.constant(predictions, tf.float32) + metric, update = metrics.sparse_average_precision_at_k( + labels, predictions, k, weights=weights) + + # Fails without initialized vars. + self.assertRaises(tf.OpError, metric.eval) + self.assertRaises(tf.OpError, update.eval) + local_variables = tf.local_variables() + tf.initialize_variables(local_variables).run() + + # Run per-step op and assert expected values. + if math.isnan(expected): + _assert_nan(self, update.eval()) + _assert_nan(self, metric.eval()) + else: + self.assertAlmostEqual(expected, update.eval()) + self.assertAlmostEqual(expected, metric.eval()) + + def test_average_precision(self): + # Example 1. + # Matches example here: + # fastml.com/what-you-wanted-to-know-about-mean-average-precision + labels_ex1 = (0, 1, 2, 3, 4) + labels = np.array([labels_ex1], dtype=np.int64) + predictions_ex1 = (0.2, 0.1, 0.0, 0.4, 0.0, 0.5, 0.3) + predictions = (predictions_ex1,) + precision_ex1 = ( + 0.0 / 1, + 1.0 / 2, + 1.0 / 3, + 2.0 / 4 + ) + avg_precision_ex1 = ( + 0.0 / 1, + precision_ex1[1] / 2, + precision_ex1[1] / 3, + (precision_ex1[1] + precision_ex1[3]) / 4 + ) + for i in xrange(4): + k = i + 1 + self._test_streaming_sparse_precision_at_k( + predictions, labels, k, expected=precision_ex1[i]) + self._test_streaming_sparse_average_precision_at_k( + predictions, labels, k, expected=avg_precision_ex1[i]) + + # Example 2. + labels_ex2 = (0, 2, 4, 5, 6) + labels = np.array([labels_ex2], dtype=np.int64) + predictions_ex2 = (0.3, 0.5, 0.0, 0.4, 0.0, 0.1, 0.2) + predictions = (predictions_ex2,) + precision_ex2 = ( + 0.0 / 1, + 0.0 / 2, + 1.0 / 3, + 2.0 / 4 + ) + avg_precision_ex2 = ( + 0.0 / 1, + 0.0 / 2, + precision_ex2[2] / 3, + (precision_ex2[2] + precision_ex2[3]) / 4 + ) + for i in xrange(4): + k = i + 1 + self._test_streaming_sparse_precision_at_k( + predictions, labels, k, expected=precision_ex2[i]) + self._test_streaming_sparse_average_precision_at_k( + predictions, labels, k, expected=avg_precision_ex2[i]) + + # Both examples, we expect both precision and average precision to be the + # average of the 2 examples. 
+ labels = np.array([labels_ex1, labels_ex2], dtype=np.int64) + predictions = (predictions_ex1, predictions_ex2) + streaming_precision = [ + (ex1 + ex2) / 2 + for ex1, ex2 in zip(precision_ex1, precision_ex2)] + streaming_average_precision = [ + (ex1 + ex2) / 2 + for ex1, ex2 in zip(avg_precision_ex1, avg_precision_ex2)] + for i in xrange(4): + k = i + 1 + self._test_streaming_sparse_precision_at_k( + predictions, labels, k, expected=streaming_precision[i]) + self._test_streaming_sparse_average_precision_at_k( + predictions, labels, k, expected=streaming_average_precision[i]) + + # Weighted examples, we expect streaming average precision to be the + # weighted average of the 2 examples. + weights = (0.3, 0.6) + streaming_average_precision = [ + (weights[0] * ex1 + weights[1] * ex2) / (weights[0] + weights[1]) + for ex1, ex2 in zip(avg_precision_ex1, avg_precision_ex2)] + for i in xrange(4): + k = i + 1 + self._test_streaming_sparse_average_precision_at_k( + predictions, labels, k, expected=streaming_average_precision[i], + weights=weights) + + def test_average_precision_some_labels_out_of_range(self): + """Tests that labels outside the [0, n_classes) range are ignored.""" + labels_ex1 = (-1, 0, 1, 2, 3, 4, 7) + labels = np.array([labels_ex1], dtype=np.int64) + predictions_ex1 = (0.2, 0.1, 0.0, 0.4, 0.0, 0.5, 0.3) + predictions = (predictions_ex1,) + precision_ex1 = ( + 0.0 / 1, + 1.0 / 2, + 1.0 / 3, + 2.0 / 4 + ) + avg_precision_ex1 = ( + 0.0 / 1, + precision_ex1[1] / 2, + precision_ex1[1] / 3, + (precision_ex1[1] + precision_ex1[3]) / 4 + ) + for i in xrange(4): + k = i + 1 + self._test_streaming_sparse_precision_at_k( + predictions, labels, k, expected=precision_ex1[i]) + self._test_streaming_sparse_average_precision_at_k( + predictions, labels, k, expected=avg_precision_ex1[i]) + + def test_one_label_at_k1_nan(self): + predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]] + sparse_labels = _binary_2d_label_to_sparse_value( + [[0, 0, 0, 1], [0, 0, 1, 0]]) + dense_labels = np.array([[3], [2]], dtype=np.int64) + + for labels in (sparse_labels, dense_labels): + # Classes 0,1,2 have 0 predictions, classes -1 and 4 are out of range. + for class_id in (-1, 0, 1, 2, 4): + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=1, expected=NAN, class_id=class_id) + + def test_one_label_at_k1(self): + predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]] + sparse_labels = _binary_2d_label_to_sparse_value( + [[0, 0, 0, 1], [0, 0, 1, 0]]) + dense_labels = np.array([[3], [2]], dtype=np.int64) + + for labels in (sparse_labels, dense_labels): + # Class 3: 1 label, 2 predictions, 1 correct. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=1, expected=1.0 / 2, class_id=3) + + # All classes: 2 labels, 2 predictions, 1 correct. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=1, expected=1.0 / 2) + + def test_three_labels_at_k5_no_predictions(self): + predictions = [ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6] + ] + sparse_labels = _binary_2d_label_to_sparse_value([ + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0] + ]) + dense_labels = np.array([[2, 7, 8], [1, 2, 5]], dtype=np.int64) + + for labels in (sparse_labels, dense_labels): + # Classes 1,3,8 have 0 predictions, classes -1 and 10 are out of range. 
+ for class_id in (-1, 1, 3, 8, 10): + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=NAN, class_id=class_id) + + def test_three_labels_at_k5_no_labels(self): + predictions = [ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6] + ] + sparse_labels = _binary_2d_label_to_sparse_value([ + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0] + ]) + dense_labels = np.array([[2, 7, 8], [1, 2, 5]], dtype=np.int64) + + for labels in (sparse_labels, dense_labels): + # Classes 0,4,6,9: 0 labels, >=1 prediction. + for class_id in (0, 4, 6, 9): + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=0.0, class_id=class_id) + + def test_three_labels_at_k5(self): + predictions = [ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6] + ] + sparse_labels = _binary_2d_label_to_sparse_value([ + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0] + ]) + dense_labels = np.array([[2, 7, 8], [1, 2, 5]], dtype=np.int64) + + for labels in (sparse_labels, dense_labels): + # Class 2: 2 labels, 2 correct predictions. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=2.0 / 2, + class_id=2) + + # Class 5: 1 label, 1 correct prediction. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=1.0 / 1, class_id=5) + + # Class 7: 1 label, 1 incorrect prediction. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=0.0 / 1, class_id=7) + + # All classes: 10 predictions, 3 correct. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=3.0 / 10) + + def test_three_labels_at_k5_some_out_of_range(self): + """Tests that labels outside the [0, n_classes) range are ignored.""" + predictions = [ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6] + ] + sp_labels = tf.SparseTensorValue( + indices=[[0, 0], [0, 1], [0, 2], [0, 3], + [1, 0], [1, 1], [1, 2], [1, 3]], + # values -1 and 10 are outside the [0, n_classes) range and are ignored. + values=np.array([2, 7, -1, 8, + 1, 2, 5, 10], np.int64), + shape=[2, 4]) + + # Class 2: 2 labels, 2 correct predictions. + self._test_streaming_sparse_precision_at_k( + predictions, sp_labels, k=5, expected=2.0 / 2, class_id=2) + + # Class 5: 1 label, 1 correct prediction. + self._test_streaming_sparse_precision_at_k( + predictions, sp_labels, k=5, expected=1.0 / 1, class_id=5) + + # Class 7: 1 label, 1 incorrect prediction. + self._test_streaming_sparse_precision_at_k( + predictions, sp_labels, k=5, expected=0.0 / 1, class_id=7) + + # All classes: 10 predictions, 3 correct. + self._test_streaming_sparse_precision_at_k( + predictions, sp_labels, k=5, expected=3.0 / 10) + + def test_3d_nan(self): + predictions = [[ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6] + ], [ + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6], + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9] + ]] + labels = _binary_3d_label_to_sparse_value([[ + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0] + ], [ + [0, 1, 1, 0, 0, 1, 0, 1, 0, 0], + [0, 0, 1, 0, 0, 0, 0, 0, 1, 0] + ]]) + + # Classes 1,3,8 have 0 predictions, classes -1 and 10 are out of range. 
+ for class_id in (-1, 1, 3, 8, 10): + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=NAN, class_id=class_id) + + def test_3d_no_labels(self): + predictions = [[ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6] + ], [ + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6], + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9] + ]] + labels = _binary_3d_label_to_sparse_value([[ + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0] + ], [ + [0, 1, 1, 0, 0, 1, 0, 1, 0, 0], + [0, 0, 1, 0, 0, 0, 0, 0, 1, 0] + ]]) + + # Classes 0,4,6,9: 0 labels, >=1 prediction. + for class_id in (0, 4, 6, 9): + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=0.0, class_id=class_id) + + def test_3d(self): + predictions = [[ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6] + ], [ + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6], + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9] + ]] + labels = _binary_3d_label_to_sparse_value([[ + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0] + ], [ + [0, 1, 1, 0, 0, 1, 0, 1, 0, 0], + [0, 0, 1, 0, 0, 0, 0, 0, 1, 0] + ]]) + + # Class 2: 4 predictions, all correct. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=4.0 / 4, class_id=2) + + # Class 5: 2 predictions, both correct. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=2.0 / 2, class_id=5) + + # Class 7: 2 predictions, 1 correct. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=1.0 / 2, class_id=7) + + # All classes: 20 predictions, 7 correct. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=7.0 / 20) + + def test_3d_ignore_some(self): + predictions = [[ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6] + ], [ + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6], + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9] + ]] + labels = _binary_3d_label_to_sparse_value([[ + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0] + ], [ + [0, 1, 1, 0, 0, 1, 0, 1, 0, 0], + [0, 0, 1, 0, 0, 0, 0, 0, 1, 0] + ]]) + + # Class 2: 2 predictions, both correct. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=2.0 / 2.0, class_id=2, + weights=[[1], [0]]) + + # Class 2: 2 predictions, both correct. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=2.0 / 2.0, class_id=2, + weights=[[0], [1]]) + + # Class 7: 1 incorrect prediction. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=0.0 / 1.0, class_id=7, + weights=[[1], [0]]) + + # Class 7: 1 correct prediction. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=1.0 / 1.0, class_id=7, + weights=[[0], [1]]) + + # Class 7: no predictions. + self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=NAN, class_id=7, + weights=[[1, 0], [0, 1]]) + + # Class 7: 2 predictions, 1 correct. 
+ self._test_streaming_sparse_precision_at_k( + predictions, labels, k=5, expected=1.0 / 2.0, class_id=7, + weights=[[0, 1], [1, 0]]) + + def test_sparse_tensor_value(self): + predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]] + labels = [[0, 0, 0, 1], [0, 0, 1, 0]] + expected_precision = 0.5 + with self.test_session(): + _, precision = metrics.sparse_precision_at_k( + predictions=tf.constant(predictions, tf.float32), + labels=_binary_2d_label_to_sparse_value(labels), k=1) + + tf.initialize_variables(tf.local_variables()).run() + + self.assertEqual(expected_precision, precision.eval()) + + +class RecallAtkTest(tf.test.TestCase): + + def _test_streaming_sparse_recall_at_k(self, + predictions, + labels, + k, + expected, + class_id=None, + weights=None): + with tf.Graph().as_default() as g, self.test_session(g): + if weights is not None: + weights = tf.constant(weights, tf.float32) + metric, update = metrics.recall_at_k( + predictions=tf.constant(predictions, tf.float32), + labels=labels, k=k, class_id=class_id, weights=weights) + + # Fails without initialized vars. + self.assertRaises(tf.OpError, metric.eval) + self.assertRaises(tf.OpError, update.eval) + tf.initialize_variables(tf.local_variables()).run() + + # Run per-step op and assert expected values. + if math.isnan(expected): + _assert_nan(self, update.eval()) + _assert_nan(self, metric.eval()) + else: + self.assertEqual(expected, update.eval()) + self.assertEqual(expected, metric.eval()) + + def test_one_label_at_k1_nan(self): + predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]] + sparse_labels = _binary_2d_label_to_sparse_value( + [[0, 0, 0, 1], [0, 0, 1, 0]]) + dense_labels = np.array([[3], [2]], dtype=np.int64) + + # Classes 0,1 have 0 labels, 0 predictions, classes -1 and 4 are out of + # range. + for labels in (sparse_labels, dense_labels): + for class_id in (-1, 0, 1, 4): + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=NAN, + class_id=class_id) + + def test_one_label_at_k1_no_predictions(self): + predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]] + sparse_labels = _binary_2d_label_to_sparse_value( + [[0, 0, 0, 1], [0, 0, 1, 0]]) + dense_labels = np.array([[3], [2]], dtype=np.int64) + + for labels in (sparse_labels, dense_labels): + # Class 2: 0 predictions. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=0.0, + class_id=2) + + def test_one_label_at_k1(self): + predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]] + sparse_labels = _binary_2d_label_to_sparse_value( + [[0, 0, 0, 1], [0, 0, 1, 0]]) + dense_labels = np.array([[3], [2]], dtype=np.int64) + + for labels in (sparse_labels, dense_labels): + # Class 3: 1 label, 2 predictions, 1 correct. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=1.0 / 1, + class_id=3) + + # All classes: 2 labels, 2 predictions, 1 correct. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=1.0 / 2) + + def test_one_label_at_k1_weighted(self): + predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]] + sparse_labels = _binary_2d_label_to_sparse_value( + [[0, 0, 0, 1], [0, 0, 1, 0]]) + dense_labels = np.array([[3], [2]], dtype=np.int64) + + for labels in (sparse_labels, dense_labels): + # Class 3: 1 label, 2 predictions, 1 correct. 
+ self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=NAN, class_id=3, weights=(0.0,)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=1.0 / 1, class_id=3, + weights=(1.0,)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=1.0 / 1, class_id=3, + weights=(2.0,)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=NAN, class_id=3, + weights=(0.0, 0.0)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=NAN, class_id=3, + weights=(0.0, 1.0)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=1.0 / 1, class_id=3, + weights=(1.0, 0.0)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=1.0 / 1, class_id=3, + weights=(1.0, 1.0)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=2.0 / 2, class_id=3, + weights=(2.0, 3.0)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=3.0 / 3, class_id=3, + weights=(3.0, 2.0)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=0.3 / 0.3, class_id=3, + weights=(0.3, 0.6)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=0.6 / 0.6, class_id=3, + weights=(0.6, 0.3)) + + # All classes: 2 labels, 2 predictions, 1 correct. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=NAN, weights=(0.0,)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=1.0 / 2, weights=(1.0,)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=1.0 / 2, weights=(2.0,)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=1.0 / 1, weights=(1.0, 0.0)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=0.0 / 1, weights=(0.0, 1.0)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=1.0 / 2, weights=(1.0, 1.0)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=2.0 / 5, weights=(2.0, 3.0)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=3.0 / 5, weights=(3.0, 2.0)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=0.3 / 0.9, weights=(0.3, 0.6)) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=1, expected=0.6 / 0.9, weights=(0.6, 0.3)) + + def test_three_labels_at_k5_nan(self): + predictions = [ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]] + sparse_labels = _binary_2d_label_to_sparse_value([ + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]]) + dense_labels = np.array([[2, 7, 8], [1, 2, 5]], dtype=np.int64) + + for labels in (sparse_labels, dense_labels): + # Classes 0,3,4,6,9 have 0 labels, class 10 is out of range. + for class_id in (0, 3, 4, 6, 9, 10): + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=NAN, class_id=class_id) + + def test_three_labels_at_k5_no_predictions(self): + predictions = [ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]] + sparse_labels = _binary_2d_label_to_sparse_value([ + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]]) + dense_labels = np.array([[2, 7, 8], [1, 2, 5]], dtype=np.int64) + + for labels in (sparse_labels, dense_labels): + # Class 8: 1 label, no predictions. 
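The weighted recall cases reduce to weighted true-positive and false-negative counts over the labelled classes, so with per-example weights (2.0, 3.0) the all-classes expectation is 2.0/5 and the class-3 expectation is 2.0/2. A small NumPy sketch of that arithmetic (an illustrative helper assuming one weight per example, not the streaming op):

```python
import numpy as np

def recall_at_k(predictions, labels, k, weights=None, class_id=None):
  """Plain-NumPy sketch of weighted recall@k (not the TF streaming op)."""
  top_k = np.argsort(-np.asarray(predictions), axis=1)[:, :k]
  if weights is None:
    weights = np.ones(len(labels))
  tp = fn = 0.0
  for row_top_k, row_labels, w in zip(top_k, labels, weights):
    wanted = row_labels if class_id is None else row_labels & {class_id}
    for c in wanted:
      if c in row_top_k:
        tp += w  # the labelled class was retrieved in the top k
      else:
        fn += w  # the labelled class was missed
  return tp / (tp + fn) if tp + fn else float('nan')

predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
labels = [{3}, {2}]
print(recall_at_k(predictions, labels, k=1, weights=(2.0, 3.0)))              # 2/5 = 0.4
print(recall_at_k(predictions, labels, k=1, weights=(2.0, 3.0), class_id=3))  # 2/2 = 1.0
```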
+ self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=0.0 / 1, class_id=8) + + def test_three_labels_at_k5(self): + predictions = [ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]] + sparse_labels = _binary_2d_label_to_sparse_value([ + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]]) + dense_labels = np.array([[2, 7, 8], [1, 2, 5]], dtype=np.int64) + + for labels in (sparse_labels, dense_labels): + # Class 2: 2 labels, both correct. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=2.0 / 2, class_id=2) + + # Class 5: 1 label, incorrect. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=1.0 / 1, class_id=5) + + # Class 7: 1 label, incorrect. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=0.0 / 1, class_id=7) + + # All classes: 6 labels, 3 correct. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=3.0 / 6) + + def test_three_labels_at_k5_some_out_of_range(self): + """Tests that labels outside the [0, n_classes) count in denominator.""" + predictions = [ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]] + sp_labels = tf.SparseTensorValue( + indices=[[0, 0], [0, 1], [0, 2], [0, 3], + [1, 0], [1, 1], [1, 2], [1, 3]], + # values -1 and 10 are outside the [0, n_classes) range. + values=np.array([2, 7, -1, 8, + 1, 2, 5, 10], np.int64), + shape=[2, 4]) + + # Class 2: 2 labels, both correct. + self._test_streaming_sparse_recall_at_k( + predictions=predictions, labels=sp_labels, k=5, expected=2.0 / 2, + class_id=2) + + # Class 5: 1 label, incorrect. + self._test_streaming_sparse_recall_at_k( + predictions=predictions, labels=sp_labels, k=5, expected=1.0 / 1, + class_id=5) + + # Class 7: 1 label, incorrect. + self._test_streaming_sparse_recall_at_k( + predictions=predictions, labels=sp_labels, k=5, expected=0.0 / 1, + class_id=7) + + # All classes: 8 labels, 3 correct. + self._test_streaming_sparse_recall_at_k( + predictions=predictions, labels=sp_labels, k=5, expected=3.0 / 8) + + def test_3d_nan(self): + predictions = [[ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6] + ], [ + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6], + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9] + ]] + sparse_labels = _binary_3d_label_to_sparse_value([[ + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0] + ], [ + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0], + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0] + ]]) + dense_labels = np.array([[ + [2, 7, 8], + [1, 2, 5] + ], [ + [1, 2, 5], + [2, 7, 8], + ]], dtype=np.int64) + + for labels in (sparse_labels, dense_labels): + # Classes 0,3,4,6,9 have 0 labels, class 10 is out of range. 
+ for class_id in (0, 3, 4, 6, 9, 10): + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=NAN, class_id=class_id) + + def test_3d_no_predictions(self): + predictions = [[ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6] + ], [ + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6], + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9] + ]] + sparse_labels = _binary_3d_label_to_sparse_value([[ + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0] + ], [ + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0], + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0] + ]]) + dense_labels = np.array([[ + [2, 7, 8], + [1, 2, 5] + ], [ + [1, 2, 5], + [2, 7, 8], + ]], dtype=np.int64) + + for labels in (sparse_labels, dense_labels): + # Classes 1,8 have 0 predictions, >=1 label. + for class_id in (1, 8): + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=0.0, class_id=class_id) + + def test_3d(self): + predictions = [[ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6] + ], [ + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6], + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9] + ]] + labels = _binary_3d_label_to_sparse_value([[ + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0] + ], [ + [0, 1, 1, 0, 0, 1, 0, 1, 0, 0], + [0, 0, 1, 0, 0, 0, 0, 0, 1, 0] + ]]) + + # Class 2: 4 labels, all correct. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=4.0 / 4, class_id=2) + + # Class 5: 2 labels, both correct. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=2.0 / 2, class_id=5) + + # Class 7: 2 labels, 1 incorrect. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=1.0 / 2, class_id=7) + + # All classes: 12 labels, 7 correct. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=7.0 / 12) + + def test_3d_ignore_all(self): + predictions = [[ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6] + ], [ + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6], + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9] + ]] + labels = _binary_3d_label_to_sparse_value([[ + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0] + ], [ + [0, 1, 1, 0, 0, 1, 0, 1, 0, 0], + [0, 0, 1, 0, 0, 0, 0, 0, 1, 0] + ]]) + + for class_id in xrange(10): + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=NAN, class_id=class_id, + weights=[[0], [0]]) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=NAN, class_id=class_id, + weights=[[0, 0], [0, 0]]) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=NAN, weights=[[0], [0]]) + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=NAN, weights=[[0, 0], [0, 0]]) + + def test_3d_ignore_some(self): + predictions = [[ + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9], + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6] + ], [ + [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6], + [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9] + ]] + labels = _binary_3d_label_to_sparse_value([[ + [0, 0, 1, 0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1, 0, 0, 0, 0] + ], [ + [0, 1, 1, 0, 0, 1, 0, 1, 0, 0], + [0, 0, 1, 0, 0, 0, 0, 0, 1, 0] + ]]) + + # Class 2: 2 labels, both correct. 
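In these 3-D cases the weights carry either one entry per example (e.g. [[1], [0]]) or one per (example, step) pair (e.g. [[1, 0], [1, 0]]), and masked positions drop out of both numerator and denominator. A flattened hand-check of the class-7 recall with step weights [[1, 0], [1, 0]] (the top-5 sets and label sets below are written out from the tensors above; this is only a sketch of the bookkeeping):

```python
# Treat each (example, step) pair as one row with its own weight.
top5 = [{9, 4, 6, 2, 0}, {5, 7, 2, 9, 6},   # example 1, steps 1-2
        {5, 7, 2, 9, 6}, {9, 4, 6, 2, 0}]   # example 2, steps 1-2
labels = [{2, 7, 8}, {1, 2, 5}, {1, 2, 5, 7}, {2, 8}]
weights = [1, 0, 1, 0]                      # [[1, 0], [1, 0]] flattened

tp = sum(w for t, l, w in zip(top5, labels, weights) if 7 in l and 7 in t)
fn = sum(w for t, l, w in zip(top5, labels, weights) if 7 in l and 7 not in t)
print(float(tp) / (tp + fn))                # class 7: 1 correct of 2 weighted labels -> 0.5
```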
+ self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=2.0 / 2.0, class_id=2, + weights=[[1], [0]]) + + # Class 2: 2 labels, both correct. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=2.0 / 2.0, class_id=2, + weights=[[0], [1]]) + + # Class 7: 1 label, correct. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=1.0 / 1.0, class_id=7, + weights=[[0], [1]]) + + # Class 7: 1 label, incorrect. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=0.0 / 1.0, class_id=7, + weights=[[1], [0]]) + + # Class 7: 2 labels, 1 correct. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=1.0 / 2.0, class_id=7, + weights=[[1, 0], [1, 0]]) + + # Class 7: No labels. + self._test_streaming_sparse_recall_at_k( + predictions, labels, k=5, expected=NAN, class_id=7, + weights=[[0, 1], [0, 1]]) + + def test_sparse_tensor_value(self): + predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]] + labels = [[0, 0, 1, 0], [0, 0, 0, 1]] + expected_recall = 0.5 + with self.test_session(): + _, recall = metrics.recall_at_k( + predictions=tf.constant(predictions, tf.float32), + labels=_binary_2d_label_to_sparse_value(labels), k=1) + + tf.initialize_variables(tf.local_variables()).run() + + self.assertEqual(expected_recall, recall.eval()) + + +class MeanAbsoluteErrorTest(tf.test.TestCase): + + def setUp(self): + tf.reset_default_graph() + + def testVars(self): + metrics.mean_absolute_error( + predictions=tf.ones((10, 1)), labels=tf.ones((10, 1))) + _assert_local_variables(self, ( + 'mean_absolute_error/count:0', + 'mean_absolute_error/total:0' + )) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + mean, _ = metrics.mean_absolute_error( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.mean_absolute_error( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testValueTensorIsIdempotent(self): + predictions = tf.random_normal((10, 3), seed=1) + labels = tf.random_normal((10, 3), seed=2) + error, update_op = metrics.mean_absolute_error( + labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + # Run several updates. + for _ in range(10): + sess.run(update_op) + + # Then verify idempotency. 
+ initial_error = error.eval() + for _ in range(10): + self.assertEqual(initial_error, error.eval()) + + def testSingleUpdateWithErrorAndWeights(self): + predictions = tf.constant([2, 4, 6, 8], shape=(1, 4), dtype=tf.float32) + labels = tf.constant([1, 3, 2, 3], shape=(1, 4), dtype=tf.float32) + weights = tf.constant([0, 1, 0, 1], shape=(1, 4)) + + error, update_op = metrics.mean_absolute_error( + labels, predictions, weights) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertEqual(3, sess.run(update_op)) + self.assertEqual(3, error.eval()) + + +class MeanRelativeErrorTest(tf.test.TestCase): + + def setUp(self): + tf.reset_default_graph() + + def testVars(self): + metrics.mean_relative_error( + predictions=tf.ones((10, 1)), labels=tf.ones((10, 1)), + normalizer=tf.ones((10, 1))) + _assert_local_variables(self, ( + 'mean_relative_error/count:0', + 'mean_relative_error/total:0' + )) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + mean, _ = metrics.mean_relative_error( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + normalizer=tf.ones((10, 1)), + metrics_collections=[my_collection_name]) + self.assertListEqual( + tf.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.mean_relative_error( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + normalizer=tf.ones((10, 1)), + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testValueTensorIsIdempotent(self): + predictions = tf.random_normal((10, 3), seed=1) + labels = tf.random_normal((10, 3), seed=2) + normalizer = tf.random_normal((10, 3), seed=3) + error, update_op = metrics.mean_relative_error( + labels, predictions, normalizer) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + # Run several updates. + for _ in range(10): + sess.run(update_op) + + # Then verify idempotency. 
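For reference, the weighted mean_absolute_error expectation of 3 above follows directly from the streaming total/count form; a plain-NumPy check with the same values (a sketch of the arithmetic, not the op):

```python
import numpy as np

predictions = np.array([2., 4., 6., 8.])
labels = np.array([1., 3., 2., 3.])
weights = np.array([0., 1., 0., 1.])

# update_op style: total += sum(w * |p - l|); count += sum(w); metric = total / count.
total = np.sum(weights * np.abs(predictions - labels))  # 0 + 1 + 0 + 5 = 6
count = np.sum(weights)                                 # 2
print(total / count)                                    # 3.0
```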
+ initial_error = error.eval() + for _ in range(10): + self.assertEqual(initial_error, error.eval()) + + def testSingleUpdateNormalizedByLabels(self): + np_predictions = np.asarray([2, 4, 6, 8], dtype=np.float32) + np_labels = np.asarray([1, 3, 2, 3], dtype=np.float32) + expected_error = np.mean( + np.divide(np.absolute(np_predictions - np_labels), + np_labels)) + + predictions = tf.constant(np_predictions, shape=(1, 4), dtype=tf.float32) + labels = tf.constant(np_labels, shape=(1, 4)) + + error, update_op = metrics.mean_relative_error( + labels, predictions, normalizer=labels) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertEqual(expected_error, sess.run(update_op)) + self.assertEqual(expected_error, error.eval()) + + def testSingleUpdateNormalizedByZeros(self): + np_predictions = np.asarray([2, 4, 6, 8], dtype=np.float32) + + predictions = tf.constant(np_predictions, shape=(1, 4), dtype=tf.float32) + labels = tf.constant([1, 3, 2, 3], shape=(1, 4), dtype=tf.float32) + + error, update_op = metrics.mean_relative_error( + labels, predictions, normalizer=tf.zeros_like(labels)) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertEqual(0.0, sess.run(update_op)) + self.assertEqual(0.0, error.eval()) + + +class MeanSquaredErrorTest(tf.test.TestCase): + + def setUp(self): + tf.reset_default_graph() + + def testVars(self): + metrics.mean_squared_error( + predictions=tf.ones((10, 1)), labels=tf.ones((10, 1))) + _assert_local_variables(self, ( + 'mean_squared_error/count:0', + 'mean_squared_error/total:0' + )) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + mean, _ = metrics.mean_squared_error( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.mean_squared_error( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testValueTensorIsIdempotent(self): + predictions = tf.random_normal((10, 3), seed=1) + labels = tf.random_normal((10, 3), seed=2) + error, update_op = metrics.mean_squared_error( + labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + # Run several updates. + for _ in range(10): + sess.run(update_op) + + # Then verify idempotency. 
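The single-update mean_relative_error expectation above is just the average of |p - l| / normalizer, which with the labels as normalizer works out to 1.25, while a zero normalizer falls back to 0 through the op's safe division. A quick NumPy check of the normalized-by-labels case:

```python
import numpy as np

predictions = np.array([2., 4., 6., 8.])
labels = np.array([1., 3., 2., 3.])

# normalizer = labels: mean(1, 1/3, 2, 5/3) = 1.25
print(np.mean(np.abs(predictions - labels) / labels))
```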
+ initial_error = error.eval() + for _ in range(10): + self.assertEqual(initial_error, error.eval()) + + def testSingleUpdateZeroError(self): + predictions = tf.zeros((1, 3), dtype=tf.float32) + labels = tf.zeros((1, 3), dtype=tf.float32) + + error, update_op = metrics.mean_squared_error( + labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertEqual(0, sess.run(update_op)) + self.assertEqual(0, error.eval()) + + def testSingleUpdateWithError(self): + predictions = tf.constant([2, 4, 6], shape=(1, 3), dtype=tf.float32) + labels = tf.constant([1, 3, 2], shape=(1, 3), dtype=tf.float32) + + error, update_op = metrics.mean_squared_error( + labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertEqual(6, sess.run(update_op)) + self.assertEqual(6, error.eval()) + + def testSingleUpdateWithErrorAndWeights(self): + predictions = tf.constant([2, 4, 6, 8], shape=(1, 4), dtype=tf.float32) + labels = tf.constant([1, 3, 2, 3], shape=(1, 4), dtype=tf.float32) + weights = tf.constant([0, 1, 0, 1], shape=(1, 4)) + + error, update_op = metrics.mean_squared_error( + labels, predictions, weights) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertEqual(13, sess.run(update_op)) + self.assertEqual(13, error.eval()) + + def testMultipleBatchesOfSizeOne(self): + with self.test_session() as sess: + # Create the queue that populates the predictions. + preds_queue = tf.FIFOQueue(2, dtypes=tf.float32, shapes=(1, 3)) + _enqueue_vector(sess, preds_queue, [10, 8, 6]) + _enqueue_vector(sess, preds_queue, [-4, 3, -1]) + predictions = preds_queue.dequeue() + + # Create the queue that populates the labels. + labels_queue = tf.FIFOQueue(2, dtypes=tf.float32, shapes=(1, 3)) + _enqueue_vector(sess, labels_queue, [1, 3, 2]) + _enqueue_vector(sess, labels_queue, [2, 4, 6]) + labels = labels_queue.dequeue() + + error, update_op = metrics.mean_squared_error( + labels, predictions) + + sess.run(tf.local_variables_initializer()) + sess.run(update_op) + self.assertAlmostEqual(208.0 / 6, sess.run(update_op), 5) + + self.assertAlmostEqual(208.0 / 6, error.eval(), 5) + + def testMetricsComputedConcurrently(self): + with self.test_session() as sess: + # Create the queue that populates one set of predictions. + preds_queue0 = tf.FIFOQueue(2, dtypes=tf.float32, shapes=(1, 3)) + _enqueue_vector(sess, preds_queue0, [10, 8, 6]) + _enqueue_vector(sess, preds_queue0, [-4, 3, -1]) + predictions0 = preds_queue0.dequeue() + + # Create the queue that populates one set of predictions. + preds_queue1 = tf.FIFOQueue(2, dtypes=tf.float32, shapes=(1, 3)) + _enqueue_vector(sess, preds_queue1, [0, 1, 1]) + _enqueue_vector(sess, preds_queue1, [1, 1, 0]) + predictions1 = preds_queue1.dequeue() + + # Create the queue that populates one set of labels. + labels_queue0 = tf.FIFOQueue(2, dtypes=tf.float32, shapes=(1, 3)) + _enqueue_vector(sess, labels_queue0, [1, 3, 2]) + _enqueue_vector(sess, labels_queue0, [2, 4, 6]) + labels0 = labels_queue0.dequeue() + + # Create the queue that populates another set of labels. 
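The queue-based mean_squared_error case above accumulates two size-one batches into the same total/count locals, which is where the expected 208/6 comes from; a NumPy check of that running sum (a sketch of the bookkeeping only):

```python
import numpy as np

# The two size-one batches fed through the FIFO queues above.
batches = [
    (np.array([10., 8., 6.]), np.array([1., 3., 2.])),
    (np.array([-4., 3., -1.]), np.array([2., 4., 6.])),
]

total = count = 0.0
for preds, labels in batches:
  total += np.sum((preds - labels) ** 2)  # 122, then 122 + 86 = 208
  count += preds.size                     # 3, then 6
print(total / count)                      # 208/6 ~= 34.667
```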
+ labels_queue1 = tf.FIFOQueue(2, dtypes=tf.float32, shapes=(1, 3)) + _enqueue_vector(sess, labels_queue1, [-5, -3, -1]) + _enqueue_vector(sess, labels_queue1, [5, 4, 3]) + labels1 = labels_queue1.dequeue() + + mse0, update_op0 = metrics.mean_squared_error( + labels0, predictions0, name='msd0') + mse1, update_op1 = metrics.mean_squared_error( + labels1, predictions1, name='msd1') + + sess.run(tf.local_variables_initializer()) + sess.run([update_op0, update_op1]) + sess.run([update_op0, update_op1]) + + mse0, mse1 = sess.run([mse0, mse1]) + self.assertAlmostEqual(208.0 / 6, mse0, 5) + self.assertAlmostEqual(79.0 / 6, mse1, 5) + + def testMultipleMetricsOnMultipleBatchesOfSizeOne(self): + with self.test_session() as sess: + # Create the queue that populates the predictions. + preds_queue = tf.FIFOQueue(2, dtypes=tf.float32, shapes=(1, 3)) + _enqueue_vector(sess, preds_queue, [10, 8, 6]) + _enqueue_vector(sess, preds_queue, [-4, 3, -1]) + predictions = preds_queue.dequeue() + + # Create the queue that populates the labels. + labels_queue = tf.FIFOQueue(2, dtypes=tf.float32, shapes=(1, 3)) + _enqueue_vector(sess, labels_queue, [1, 3, 2]) + _enqueue_vector(sess, labels_queue, [2, 4, 6]) + labels = labels_queue.dequeue() + + mae, ma_update_op = metrics.mean_absolute_error( + labels, predictions) + mse, ms_update_op = metrics.mean_squared_error( + labels, predictions) + + sess.run(tf.local_variables_initializer()) + sess.run([ma_update_op, ms_update_op]) + sess.run([ma_update_op, ms_update_op]) + + self.assertAlmostEqual(32.0 / 6, mae.eval(), 5) + self.assertAlmostEqual(208.0 / 6, mse.eval(), 5) + + +class RootMeanSquaredErrorTest(tf.test.TestCase): + + def setUp(self): + tf.reset_default_graph() + + def testVars(self): + metrics.root_mean_squared_error( + predictions=tf.ones((10, 1)), labels=tf.ones((10, 1))) + _assert_local_variables(self, ( + 'root_mean_squared_error/count:0', + 'root_mean_squared_error/total:0' + )) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + mean, _ = metrics.root_mean_squared_error( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.root_mean_squared_error( + predictions=tf.ones((10, 1)), + labels=tf.ones((10, 1)), + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testValueTensorIsIdempotent(self): + predictions = tf.random_normal((10, 3), seed=1) + labels = tf.random_normal((10, 3), seed=2) + error, update_op = metrics.root_mean_squared_error( + labels, predictions) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + # Run several updates. + for _ in range(10): + sess.run(update_op) + + # Then verify idempotency. 
+ initial_error = error.eval() + for _ in range(10): + self.assertEqual(initial_error, error.eval()) + + def testSingleUpdateZeroError(self): + with self.test_session() as sess: + predictions = tf.constant(0.0, shape=(1, 3), dtype=tf.float32) + labels = tf.constant(0.0, shape=(1, 3), dtype=tf.float32) + + rmse, update_op = metrics.root_mean_squared_error( + labels, predictions) + + sess.run(tf.local_variables_initializer()) + self.assertEqual(0, sess.run(update_op)) + + self.assertEqual(0, rmse.eval()) + + def testSingleUpdateWithError(self): + with self.test_session() as sess: + predictions = tf.constant([2, 4, 6], shape=(1, 3), dtype=tf.float32) + labels = tf.constant([1, 3, 2], shape=(1, 3), dtype=tf.float32) + + rmse, update_op = metrics.root_mean_squared_error( + labels, predictions) + + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(math.sqrt(6), update_op.eval(), 5) + self.assertAlmostEqual(math.sqrt(6), rmse.eval(), 5) + + def testSingleUpdateWithErrorAndWeights(self): + with self.test_session() as sess: + predictions = tf.constant([2, 4, 6, 8], shape=(1, 4), dtype=tf.float32) + labels = tf.constant([1, 3, 2, 3], shape=(1, 4), dtype=tf.float32) + weights = tf.constant([0, 1, 0, 1], shape=(1, 4)) + + rmse, update_op = metrics.root_mean_squared_error( + labels, predictions, weights) + + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(math.sqrt(13), sess.run(update_op)) + + self.assertAlmostEqual(math.sqrt(13), rmse.eval(), 5) + + +def _reweight(predictions, labels, weights): + return (np.concatenate([[p] * int(w) for p, w in zip(predictions, weights)]), + np.concatenate([[l] * int(w) for l, w in zip(labels, weights)])) + + +class MeanCosineDistanceTest(tf.test.TestCase): + + def setUp(self): + tf.reset_default_graph() + + def testVars(self): + metrics.mean_cosine_distance( + predictions=tf.ones((10, 3)), labels=tf.ones((10, 3)), dim=1) + _assert_local_variables(self, ( + 'mean_cosine_distance/count:0', + 'mean_cosine_distance/total:0', + )) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + mean, _ = metrics.mean_cosine_distance( + predictions=tf.ones((10, 3)), + labels=tf.ones((10, 3)), + dim=1, + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.mean_cosine_distance( + predictions=tf.ones((10, 3)), + labels=tf.ones((10, 3)), + dim=1, + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testValueTensorIsIdempotent(self): + predictions = tf.random_normal((10, 3), seed=1) + labels = tf.random_normal((10, 3), seed=2) + error, update_op = metrics.mean_cosine_distance( + labels, predictions, dim=1) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + # Run several updates. + for _ in range(10): + sess.run(update_op) + + # Then verify idempotency. 
+ initial_error = error.eval() + for _ in range(10): + self.assertEqual(initial_error, error.eval()) + + def testSingleUpdateZeroError(self): + np_labels = np.matrix(('1 0 0;' + '0 0 1;' + '0 1 0')) + + predictions = tf.constant(np_labels, shape=(1, 3, 3), dtype=tf.float32) + labels = tf.constant(np_labels, shape=(1, 3, 3), dtype=tf.float32) + + error, update_op = metrics.mean_cosine_distance( + labels, predictions, dim=2) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertEqual(0, sess.run(update_op)) + self.assertEqual(0, error.eval()) + + def testSingleUpdateWithError1(self): + np_labels = np.matrix(('1 0 0;' + '0 0 1;' + '0 1 0')) + np_predictions = np.matrix(('1 0 0;' + '0 0 -1;' + '1 0 0')) + + predictions = tf.constant(np_predictions, shape=(3, 1, 3), dtype=tf.float32) + labels = tf.constant(np_labels, shape=(3, 1, 3), dtype=tf.float32) + + error, update_op = metrics.mean_cosine_distance( + labels, predictions, dim=2) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(1, sess.run(update_op), 5) + self.assertAlmostEqual(1, error.eval(), 5) + + def testSingleUpdateWithError2(self): + np_predictions = np.matrix(( + '0.819031913261206 0.567041924552012 0.087465312324590;' + '-0.665139432070255 -0.739487441769973 -0.103671883216994;' + '0.707106781186548 -0.707106781186548 0')) + np_labels = np.matrix(( + '0.819031913261206 0.567041924552012 0.087465312324590;' + '0.665139432070255 0.739487441769973 0.103671883216994;' + '0.707106781186548 0.707106781186548 0')) + + predictions = tf.constant(np_predictions, shape=(3, 1, 3), dtype=tf.float32) + labels = tf.constant(np_labels, shape=(3, 1, 3), dtype=tf.float32) + error, update_op = metrics.mean_cosine_distance( + labels, predictions, dim=2) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertAlmostEqual(1.0, sess.run(update_op), 5) + self.assertAlmostEqual(1.0, error.eval(), 5) + + def testSingleUpdateWithErrorAndWeights1(self): + np_predictions = np.matrix(('1 0 0;' + '0 0 -1;' + '1 0 0')) + np_labels = np.matrix(('1 0 0;' + '0 0 1;' + '0 1 0')) + + predictions = tf.constant(np_predictions, shape=(3, 1, 3), dtype=tf.float32) + labels = tf.constant(np_labels, shape=(3, 1, 3), dtype=tf.float32) + weights = tf.constant([1, 0, 0], shape=(3, 1, 1), dtype=tf.float32) + + error, update_op = metrics.mean_cosine_distance( + labels, predictions, dim=2, weights=weights) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertEqual(0, sess.run(update_op)) + self.assertEqual(0, error.eval()) + + def testSingleUpdateWithErrorAndWeights2(self): + np_predictions = np.matrix(('1 0 0;' + '0 0 -1;' + '1 0 0')) + np_labels = np.matrix(('1 0 0;' + '0 0 1;' + '0 1 0')) + + predictions = tf.constant(np_predictions, shape=(3, 1, 3), dtype=tf.float32) + labels = tf.constant(np_labels, shape=(3, 1, 3), dtype=tf.float32) + weights = tf.constant([0, 1, 1], shape=(3, 1, 1), dtype=tf.float32) + + error, update_op = metrics.mean_cosine_distance( + labels, predictions, dim=2, weights=weights) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + self.assertEqual(1.5, update_op.eval()) + self.assertEqual(1.5, error.eval()) + + +class PcntBelowThreshTest(tf.test.TestCase): + + def setUp(self): + tf.reset_default_graph() + + def testVars(self): + metrics.percentage_below(values=tf.ones((10,)), threshold=2) + _assert_local_variables(self, ( + 
'percentage_below_threshold/count:0', + 'percentage_below_threshold/total:0', + )) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + mean, _ = metrics.percentage_below( + values=tf.ones((10,)), + threshold=2, + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.percentage_below( + values=tf.ones((10,)), + threshold=2, + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testOneUpdate(self): + with self.test_session() as sess: + values = tf.constant([2, 4, 6, 8], shape=(1, 4), dtype=tf.float32) + + pcnt0, update_op0 = metrics.percentage_below( + values, 100, name='high') + pcnt1, update_op1 = metrics.percentage_below( + values, 7, name='medium') + pcnt2, update_op2 = metrics.percentage_below( + values, 1, name='low') + + sess.run(tf.local_variables_initializer()) + sess.run([update_op0, update_op1, update_op2]) + + pcnt0, pcnt1, pcnt2 = sess.run([pcnt0, pcnt1, pcnt2]) + self.assertAlmostEqual(1.0, pcnt0, 5) + self.assertAlmostEqual(0.75, pcnt1, 5) + self.assertAlmostEqual(0.0, pcnt2, 5) + + def testSomePresentOneUpdate(self): + with self.test_session() as sess: + values = tf.constant([2, 4, 6, 8], shape=(1, 4), dtype=tf.float32) + weights = tf.constant([1, 0, 0, 1], shape=(1, 4), dtype=tf.float32) + + pcnt0, update_op0 = metrics.percentage_below( + values, 100, weights=weights, name='high') + pcnt1, update_op1 = metrics.percentage_below( + values, 7, weights=weights, name='medium') + pcnt2, update_op2 = metrics.percentage_below( + values, 1, weights=weights, name='low') + + sess.run(tf.local_variables_initializer()) + self.assertListEqual([1.0, 0.5, 0.0], + sess.run([update_op0, update_op1, update_op2])) + + pcnt0, pcnt1, pcnt2 = sess.run([pcnt0, pcnt1, pcnt2]) + self.assertAlmostEqual(1.0, pcnt0, 5) + self.assertAlmostEqual(0.5, pcnt1, 5) + self.assertAlmostEqual(0.0, pcnt2, 5) + + +class MeanIOUTest(tf.test.TestCase): + + def setUp(self): + np.random.seed(1) + tf.reset_default_graph() + + def testVars(self): + metrics.mean_iou( + predictions=tf.ones([10, 1]), labels=tf.ones([10, 1]), num_classes=2) + _assert_local_variables(self, ('mean_iou/total_confusion_matrix:0',)) + + def testMetricsCollections(self): + my_collection_name = '__metrics__' + mean_iou, _ = metrics.mean_iou( + predictions=tf.ones([10, 1]), + labels=tf.ones([10, 1]), + num_classes=2, + metrics_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [mean_iou]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.mean_iou( + predictions=tf.ones([10, 1]), + labels=tf.ones([10, 1]), + num_classes=2, + updates_collections=[my_collection_name]) + self.assertListEqual(tf.get_collection(my_collection_name), [update_op]) + + def testPredictionsAndLabelsOfDifferentSizeRaisesValueError(self): + predictions = tf.ones([10, 3]) + labels = tf.ones([10, 4]) + with self.assertRaises(ValueError): + metrics.mean_iou( + labels, predictions, num_classes=2) + + def testLabelsAndWeightsOfDifferentSizeRaisesValueError(self): + predictions = tf.ones([10]) + labels = tf.ones([10]) + weights = tf.zeros([9]) + with self.assertRaises(ValueError): + metrics.mean_iou( + labels, predictions, num_classes=2, weights=weights) + + def testValueTensorIsIdempotent(self): + num_classes = 3 + predictions = 
tf.random_uniform([10], maxval=num_classes, + dtype=tf.int64, seed=1) + labels = tf.random_uniform([10], maxval=num_classes, + dtype=tf.int64, seed=1) + miou, update_op = metrics.mean_iou( + labels, predictions, num_classes=num_classes) + + with self.test_session() as sess: + sess.run(tf.local_variables_initializer()) + + # Run several updates. + for _ in range(10): + sess.run(update_op) + + # Then verify idempotency. + initial_miou = miou.eval() + for _ in range(10): + self.assertEqual(initial_miou, miou.eval()) + + def testMultipleUpdates(self): + num_classes = 3 + with self.test_session() as sess: + # Create the queue that populates the predictions. + preds_queue = tf.FIFOQueue(5, dtypes=tf.int32, shapes=(1, 1)) + _enqueue_vector(sess, preds_queue, [0]) + _enqueue_vector(sess, preds_queue, [1]) + _enqueue_vector(sess, preds_queue, [2]) + _enqueue_vector(sess, preds_queue, [1]) + _enqueue_vector(sess, preds_queue, [0]) + predictions = preds_queue.dequeue() + + # Create the queue that populates the labels. + labels_queue = tf.FIFOQueue(5, dtypes=tf.int32, shapes=(1, 1)) + _enqueue_vector(sess, labels_queue, [0]) + _enqueue_vector(sess, labels_queue, [1]) + _enqueue_vector(sess, labels_queue, [1]) + _enqueue_vector(sess, labels_queue, [2]) + _enqueue_vector(sess, labels_queue, [1]) + labels = labels_queue.dequeue() + + miou, update_op = metrics.mean_iou( + labels, predictions, num_classes) + + sess.run(tf.local_variables_initializer()) + for _ in range(5): + sess.run(update_op) + desired_output = np.mean([1.0/2.0, 1.0/4.0, 0.]) + self.assertEqual(desired_output, miou.eval()) + + def testMultipleUpdatesWithWeights(self): + num_classes = 2 + with self.test_session() as sess: + # Create the queue that populates the predictions. + preds_queue = tf.FIFOQueue(6, dtypes=tf.int32, shapes=(1, 1)) + _enqueue_vector(sess, preds_queue, [0]) + _enqueue_vector(sess, preds_queue, [1]) + _enqueue_vector(sess, preds_queue, [0]) + _enqueue_vector(sess, preds_queue, [1]) + _enqueue_vector(sess, preds_queue, [0]) + _enqueue_vector(sess, preds_queue, [1]) + predictions = preds_queue.dequeue() + + # Create the queue that populates the labels. + labels_queue = tf.FIFOQueue(6, dtypes=tf.int32, shapes=(1, 1)) + _enqueue_vector(sess, labels_queue, [0]) + _enqueue_vector(sess, labels_queue, [1]) + _enqueue_vector(sess, labels_queue, [1]) + _enqueue_vector(sess, labels_queue, [0]) + _enqueue_vector(sess, labels_queue, [0]) + _enqueue_vector(sess, labels_queue, [1]) + labels = labels_queue.dequeue() + + # Create the queue that populates the weights. + weights_queue = tf.FIFOQueue(6, dtypes=tf.float32, shapes=(1, 1)) + _enqueue_vector(sess, weights_queue, [1.0]) + _enqueue_vector(sess, weights_queue, [1.0]) + _enqueue_vector(sess, weights_queue, [1.0]) + _enqueue_vector(sess, weights_queue, [0.0]) + _enqueue_vector(sess, weights_queue, [1.0]) + _enqueue_vector(sess, weights_queue, [0.0]) + weights = weights_queue.dequeue() + + miou, update_op = metrics.mean_iou( + labels, predictions, num_classes, weights=weights) + + sess.run(tf.local_variables_initializer()) + for _ in range(6): + sess.run(update_op) + desired_output = np.mean([2.0/3.0, 1.0/2.0]) + self.assertAlmostEqual(desired_output, miou.eval()) + + def testMultipleUpdatesWithMissingClass(self): + # Test the case where there are no predicions and labels for + # one class, and thus there is one row and one column with + # zero entries in the confusion matrix. + num_classes = 3 + with self.test_session() as sess: + # Create the queue that populates the predictions. 
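The mean_iou expectations above come straight from the accumulated confusion matrix: per class, IoU = true positives / (row sum + column sum - true positives), averaged over all classes, with zero-denominator classes contributing 0 as in these tests. A NumPy sketch using the values from testMultipleUpdates (illustrative helper, not the streaming op):

```python
import numpy as np

def mean_iou_from_pairs(labels, predictions, num_classes):
  """Plain-NumPy sketch of the mean-IoU arithmetic."""
  cm = np.zeros((num_classes, num_classes))
  for l, p in zip(labels, predictions):
    cm[l, p] += 1  # accumulated confusion matrix (rows: labels, cols: predictions)
  tp = np.diag(cm)
  denom = cm.sum(axis=0) + cm.sum(axis=1) - tp  # per-class union
  iou = np.where(denom > 0, tp / np.maximum(denom, 1), 0.0)
  return iou.mean()

# Expected mean([1/2, 1/4, 0]) = 0.25 for the five (label, prediction) pairs above.
print(mean_iou_from_pairs([0, 1, 1, 2, 1], [0, 1, 2, 1, 0], num_classes=3))
```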
+ # There is no prediction for class 2. + preds_queue = tf.FIFOQueue(5, dtypes=tf.int32, shapes=(1, 1)) + _enqueue_vector(sess, preds_queue, [0]) + _enqueue_vector(sess, preds_queue, [1]) + _enqueue_vector(sess, preds_queue, [1]) + _enqueue_vector(sess, preds_queue, [1]) + _enqueue_vector(sess, preds_queue, [0]) + predictions = preds_queue.dequeue() + + # Create the queue that populates the labels. + # There is label for class 2. + labels_queue = tf.FIFOQueue(5, dtypes=tf.int32, shapes=(1, 1)) + _enqueue_vector(sess, labels_queue, [0]) + _enqueue_vector(sess, labels_queue, [1]) + _enqueue_vector(sess, labels_queue, [1]) + _enqueue_vector(sess, labels_queue, [0]) + _enqueue_vector(sess, labels_queue, [1]) + labels = labels_queue.dequeue() + + miou, update_op = metrics.mean_iou( + labels, predictions, num_classes) + + sess.run(tf.local_variables_initializer()) + for _ in range(5): + sess.run(update_op) + desired_output = np.mean([1.0/3.0, 2.0/4.0, 0.]) + self.assertAlmostEqual(desired_output, miou.eval()) + + def testUpdateOpEvalIsAccumulatedConfusionMatrix(self): + predictions = tf.concat(0, + [tf.constant(0, shape=[5]), + tf.constant(1, shape=[5])]) + labels = tf.concat(0, + [tf.constant(0, shape=[3]), + tf.constant(1, shape=[7])]) + num_classes = 2 + with self.test_session() as sess: + miou, update_op = metrics.mean_iou( + labels, predictions, num_classes) + sess.run(tf.local_variables_initializer()) + confusion_matrix = update_op.eval() + self.assertAllEqual([[3, 2], [0, 5]], confusion_matrix) + desired_miou = np.mean([3./5., 5./7.]) + self.assertAlmostEqual(desired_miou, miou.eval()) + + def testAllCorrect(self): + predictions = tf.zeros([40]) + labels = tf.zeros([40]) + num_classes = 1 + with self.test_session() as sess: + miou, update_op = metrics.mean_iou( + labels, predictions, num_classes) + sess.run(tf.local_variables_initializer()) + self.assertEqual(40, update_op.eval()[0]) + self.assertEqual(1.0, miou.eval()) + + def testAllWrong(self): + predictions = tf.zeros([40]) + labels = tf.ones([40]) + num_classes = 2 + with self.test_session() as sess: + miou, update_op = metrics.mean_iou( + labels, predictions, num_classes) + sess.run(tf.local_variables_initializer()) + self.assertAllEqual([[0, 40], [0, 0]], update_op.eval()) + self.assertEqual(0., miou.eval()) + + def testResultsWithSomeMissing(self): + predictions = tf.concat(0, [tf.constant(0, shape=[5]), + tf.constant(1, shape=[5])]) + labels = tf.concat(0, [tf.constant(0, shape=[3]), + tf.constant(1, shape=[7])]) + num_classes = 2 + weights = tf.concat(0, [tf.constant(0, shape=[1]), + tf.constant(1, shape=[8]), + tf.constant(0, shape=[1])]) + with self.test_session() as sess: + miou, update_op = metrics.mean_iou( + labels, predictions, num_classes, weights=weights) + sess.run(tf.local_variables_initializer()) + self.assertAllEqual([[2, 2], [0, 4]], update_op.eval()) + desired_miou = np.mean([2./4., 4./6.]) + self.assertAlmostEqual(desired_miou, miou.eval()) + + +if __name__ == '__main__': + tf.test.main() diff --git a/tensorflow/python/ops/confusion_matrix.py b/tensorflow/python/ops/confusion_matrix.py new file mode 100644 index 00000000000..0b9e79c640b --- /dev/null +++ b/tensorflow/python/ops/confusion_matrix.py @@ -0,0 +1,163 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Confusion matrix related utilities. + + +@@remove_squeezable_dimensions +@@confusion_matrix +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import sparse_tensor +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import sparse_ops + + +def remove_squeezable_dimensions(labels, predictions, name=None): + """Squeeze last dim if ranks of `predictions` and `labels` differ by 1. + + This will use static shape if available. Otherwise, it will add graph + operations, which could result in a performance hit. + + Args: + labels: Label values, a `Tensor` whose dimensions match `predictions`. + predictions: Predicted values, a `Tensor` of arbitrary dimensions. + name: Name of the op. + + Returns: + Tuple of `labels` and `predictions`, possibly with last dim squeezed. + """ + with ops.name_scope(name, 'remove_squeezable_dimensions', + [labels, predictions]): + predictions = ops.convert_to_tensor(predictions) + labels = ops.convert_to_tensor(labels) + predictions_shape = predictions.get_shape() + predictions_rank = predictions_shape.ndims + labels_shape = labels.get_shape() + labels_rank = labels_shape.ndims + if (labels_rank is not None) and (predictions_rank is not None): + # Use static rank. + rank_diff = predictions_rank - labels_rank + if rank_diff == -1: + labels = array_ops.squeeze(labels, [-1]) + elif rank_diff == 1: + predictions = array_ops.squeeze(predictions, [-1]) + return labels, predictions + + # Use dynamic rank. + rank_diff = array_ops.rank(predictions) - array_ops.rank(labels) + if (predictions_rank is None) or ( + predictions_shape.dims[-1].is_compatible_with(1)): + predictions = control_flow_ops.cond( + math_ops.equal(1, rank_diff), + lambda: array_ops.squeeze(predictions, [-1]), + lambda: predictions) + if (labels_rank is None) or ( + labels_shape.dims[-1].is_compatible_with(1)): + labels = control_flow_ops.cond( + math_ops.equal(-1, rank_diff), + lambda: array_ops.squeeze(labels, [-1]), + lambda: labels) + return labels, predictions + + +def confusion_matrix(labels, predictions, num_classes=None, dtype=dtypes.int32, + name=None, weights=None): + """Computes the confusion matrix from predictions and labels. + + Calculate the Confusion Matrix for a pair of prediction and + label 1-D int arrays. + + The matrix rows represent the prediction labels and the columns + represents the real labels. The confusion matrix is always a 2-D array + of shape `[n, n]`, where `n` is the number of valid labels for a given + classification task. Both prediction and labels must be 1-D arrays of + the same shape in order for this function to work. + + If `num_classes` is None, then `num_classes` will be set to the one plus + the maximum value in either predictions or labels. 
+ Class labels are expected to start at 0. E.g., if `num_classes` was + three, then the possible labels would be `[0, 1, 2]`. + + If `weights` is not `None`, then each prediction contributes its + corresponding weight to the total value of the confusion matrix cell. + + For example: + + ```python + tf.contrib.metrics.confusion_matrix([1, 2, 4], [2, 2, 4]) ==> + [[0 0 0 0 0] + [0 0 1 0 0] + [0 0 1 0 0] + [0 0 0 0 0] + [0 0 0 0 1]] + ``` + + Note that the possible labels are assumed to be `[0, 1, 2, 3, 4]`, + resulting in a 5x5 confusion matrix. + + Args: + labels: A 1-D representing the real labels for the classification task. + predictions: A 1-D array representing the predictions for a given + classification. + num_classes: The possible number of labels the classification task can + have. If this value is not provided, it will be calculated + using both predictions and labels array. + dtype: Data type of the confusion matrix. + name: Scope name. + weights: An optional `Tensor` whose shape matches `predictions`. + + Returns: + A k X k matrix representing the confusion matrix, where k is the number of + possible labels in the classification task. + + Raises: + ValueError: If both predictions and labels are not 1-D vectors and have + mismatched shapes, or if `weights` is not `None` and its shape doesn't + match `predictions`. + """ + with ops.name_scope(name, 'confusion_matrix', + [predictions, labels, num_classes]) as name: + labels, predictions = remove_squeezable_dimensions( + ops.convert_to_tensor(labels, name='labels'), + ops.convert_to_tensor( + predictions, name='predictions')) + predictions = math_ops.cast(predictions, dtypes.int64) + labels = math_ops.cast(labels, dtypes.int64) + + if num_classes is None: + num_classes = math_ops.maximum(math_ops.reduce_max(predictions), + math_ops.reduce_max(labels)) + 1 + + if weights is not None: + predictions.get_shape().assert_is_compatible_with(weights.get_shape()) + weights = math_ops.cast(weights, dtype) + + shape = array_ops.pack([num_classes, num_classes]) + indices = array_ops.transpose(array_ops.pack([predictions, labels])) + values = (array_ops.ones_like(predictions, dtype) + if weights is None else weights) + cm_sparse = sparse_tensor.SparseTensor( + indices=indices, values=values, shape=math_ops.to_int64(shape)) + zero_matrix = array_ops.zeros(math_ops.to_int32(shape), dtype) + + return sparse_ops.sparse_add(zero_matrix, cm_sparse) diff --git a/tensorflow/python/ops/metrics.py b/tensorflow/python/ops/metrics.py new file mode 100644 index 00000000000..1394e4b7612 --- /dev/null +++ b/tensorflow/python/ops/metrics.py @@ -0,0 +1,2588 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Evaluation-related metrics. 
+ +@@accuracy +@@auc +@@mean +@@mean_absolute_error +@@mean_cosine_distance +@mean_iou +@@mean_relative_error +@@mean_squared_error +@@mean_tensor +@@percentage_below +@@precision +@@precision_at_thresholds +@@recall +@@recall_at_k +@@recall_at_thresholds +@@root_mean_squared_error +@@sensitivity_at_specificity +@@sparse_average_precision_at_k +@@sparse_precision_at_k +@@specificity_at_sensitivity + +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import sparse_tensor +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import check_ops +from tensorflow.python.ops import confusion_matrix +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import nn +from tensorflow.python.ops import sets +from tensorflow.python.ops import sparse_ops +from tensorflow.python.ops import state_ops +from tensorflow.python.ops import variable_scope +from tensorflow.python.ops import variables + + +def _local_variable(initial_value, validate_shape=True, name=None): + """Create variable and add it to `GraphKeys.LOCAL_VARIABLES` collection. + + Args: + initial_value: See variables.Variable.__init__. + validate_shape: See variables.Variable.__init__. + name: See variables.Variable.__init__. + Returns: + New variable. + """ + return variables.Variable( + initial_value, trainable=False, + collections=[ops.GraphKeys.LOCAL_VARIABLES], + validate_shape=validate_shape, name=name) + + +def _remove_squeezable_dimensions(labels, predictions, weights): + """Internal version of _remove_squeezable_dimensions which handles weights. + + Squeezes `predictions` and `labels` if their rank differs by 1. + Squeezes `weights` if its rank is 1 more than the new rank of `predictions` + + This will use static shape if available. Otherwise, it will add graph + operations, which could result in a performance hit. + + Args: + labels: Label values, a `Tensor` whose dimensions match `predictions`. + predictions: Predicted values, a `Tensor` of arbitrary dimensions. + weights: Optional weight `Tensor`. It will be squeezed if its rank is 1 + more than the new rank of `predictions` + + Returns: + Tuple of `predictions`, `labels` and `weights`, possibly with the last + dimension squeezed. + """ + labels, predictions = confusion_matrix.remove_squeezable_dimensions( + labels, predictions) + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + + if weights is not None: + weights = ops.convert_to_tensor(weights) + predictions_shape = predictions.get_shape() + predictions_rank = predictions_shape.ndims + weights_shape = weights.get_shape() + weights_rank = weights_shape.ndims + + if (predictions_rank is not None) and (weights_rank is not None): + # Use static rank. + if weights_rank - predictions_rank == 1: + weights = array_ops.squeeze(weights, [-1]) + elif (weights_rank is None) or ( + weights_shape.dims[-1].is_compatible_with(1)): + # Use dynamic rank + weights = control_flow_ops.cond( + math_ops.equal(array_ops.rank(weights), + math_ops.add(array_ops.rank(predictions), 1)), + lambda: array_ops.squeeze(weights, [-1]), + lambda: weights) + return labels, predictions, weights + + +def _create_local(name, shape, collections=None, validate_shape=True, + dtype=dtypes.float32): + """Creates a new local variable. 
+ + Args: + name: The name of the new or existing variable. + shape: Shape of the new or existing variable. + collections: A list of collection names to which the Variable will be added. + validate_shape: Whether to validate the shape of the variable. + dtype: Data type of the variables. + + Returns: + The created variable. + """ + # Make sure local variables are added to tf.GraphKeys.LOCAL_VARIABLES + collections = list(collections or []) + collections += [ops.GraphKeys.LOCAL_VARIABLES] + return variables.Variable( + initial_value=array_ops.zeros(shape, dtype=dtype), + name=name, + trainable=False, + collections=collections, + validate_shape=validate_shape) + + +def _broadcast_weights(weights, values): + """Broadcast `weights` to the same shape as `values`. + + This returns a version of `weights` following the same broadcast rules as + `mul(weights, values)`. When computing a weighted average, use this function + to broadcast `weights` before summing them; e.g., + `reduce_sum(w * v) / reduce_sum(_broadcast_weights(w, v))`. + + Args: + weights: `Tensor` whose shape is broadcastable to `values`. + values: `Tensor` of any shape. + + Returns: + `weights` broadcast to `values` shape. + """ + weights_shape = weights.get_shape() + values_shape = values.get_shape() + if (weights_shape.is_fully_defined() and + values_shape.is_fully_defined() and + weights_shape.is_compatible_with(values_shape)): + return weights + return math_ops.mul( + weights, array_ops.ones_like(values), name='broadcast_weights') + + +def _safe_div(numerator, denominator, name): + """Divides two values, returning 0 if the denominator is <= 0. + + Args: + numerator: A real `Tensor`. + denominator: A real `Tensor`, with dtype matching `numerator`. + name: Name for the returned op. + + Returns: + 0 if `denominator` <= 0, else `numerator` / `denominator` + """ + return math_ops.select( + math_ops.greater(denominator, 0), + math_ops.truediv(numerator, denominator), + 0, + name=name) + + +def _safe_scalar_div(numerator, denominator, name): + """Divides two values, returning 0 if the denominator is 0. + + Args: + numerator: A scalar `float64` `Tensor`. + denominator: A scalar `float64` `Tensor`. + name: Name for the returned op. + + Returns: + 0 if `denominator` == 0, else `numerator` / `denominator` + """ + numerator.get_shape().with_rank_at_most(1) + denominator.get_shape().with_rank_at_most(1) + return control_flow_ops.cond( + math_ops.equal( + array_ops.constant(0.0, dtype=dtypes.float64), denominator), + lambda: array_ops.constant(0.0, dtype=dtypes.float64), + lambda: math_ops.div(numerator, denominator), + name=name) + + +def mean(values, weights=None, metrics_collections=None, + updates_collections=None, name=None): + """Computes the (weighted) mean of the given values. + + The `mean` function creates two local variables, `total` and `count` + that are used to compute the average of `values`. This average is ultimately + returned as `mean` which is an idempotent operation that simply divides + `total` by `count`. + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the `mean`. + `update_op` increments `total` with the reduced sum of the product of `values` + and `weights`, and it increments `count` with the reduced sum of `weights`. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + values: A `Tensor` of arbitrary dimensions. 
+ weights: An optional `Tensor` whose shape is broadcastable to `values`. + metrics_collections: An optional list of collections that `mean` + should be added to. + updates_collections: An optional list of collections that `update_op` + should be added to. + name: An optional variable_scope name. + + Returns: + mean: A `Tensor` representing the current mean, the value of `total` divided + by `count`. + update_op: An operation that increments the `total` and `count` variables + appropriately and whose value matches `mean_value`. + + Raises: + ValueError: If `weights` is not `None` and its shape doesn't match `values`, + or if either `metrics_collections` or `updates_collections` are not a list + or tuple. + """ + with variable_scope.variable_scope(name, 'mean', (values, weights)): + values = math_ops.to_float(values) + + total = _create_local('total', shape=[]) + count = _create_local('count', shape=[]) + + if weights is not None: + weights = math_ops.to_float(weights) + values = math_ops.mul(values, weights) + num_values = math_ops.reduce_sum(_broadcast_weights(weights, values)) + else: + num_values = math_ops.to_float(array_ops.size(values)) + + total_compute_op = state_ops.assign_add(total, math_ops.reduce_sum(values)) + count_compute_op = state_ops.assign_add(count, num_values) + + mean_t = _safe_div(total, count, 'value') + with ops.control_dependencies([total_compute_op, count_compute_op]): + update_op = _safe_div(total, count, 'update_op') + + if metrics_collections: + ops.add_to_collections(metrics_collections, mean_t) + + if updates_collections: + ops.add_to_collections(updates_collections, update_op) + + return mean_t, update_op + + +def accuracy(labels, predictions, weights=None, metrics_collections=None, + updates_collections=None, name=None): + """Calculates how often `predictions` matches `labels`. + + The `accuracy` function creates two local variables, `total` and + `count` that are used to compute the frequency with which `predictions` + matches `labels`. This frequency is ultimately returned as `accuracy`: an + idempotent operation that simply divides `total` by `count`. + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the `accuracy`. + Internally, an `is_correct` operation computes a `Tensor` with elements 1.0 + where the corresponding elements of `predictions` and `labels` match and 0.0 + otherwise. Then `update_op` increments `total` with the reduced sum of the + product of `weights` and `is_correct`, and it increments `count` with the + reduced sum of `weights`. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + labels: The ground truth values, a `Tensor` whose shape matches + `predictions`. + predictions: The predicted values, a `Tensor` of any shape. + weights: An optional `Tensor` whose shape is broadcastable to `predictions`. + metrics_collections: An optional list of collections that `accuracy` should + be added to. + updates_collections: An optional list of collections that `update_op` should + be added to. + name: An optional variable_scope name. + + Returns: + accuracy: A `Tensor` representing the accuracy, the value of `total` divided + by `count`. + update_op: An operation that increments the `total` and `count` variables + appropriately and whose value matches `accuracy`. 
+
+  Raises:
+    ValueError: If `predictions` and `labels` have mismatched shapes, or if
+      `weights` is not `None` and its shape doesn't match `predictions`, or if
+      either `metrics_collections` or `updates_collections` are not a list or
+      tuple.
+  """
+  labels, predictions, weights = _remove_squeezable_dimensions(
+      labels, predictions, weights=weights)
+  predictions.get_shape().assert_is_compatible_with(labels.get_shape())
+  if labels.dtype != predictions.dtype:
+    predictions = math_ops.cast(predictions, labels.dtype)
+  is_correct = math_ops.to_float(math_ops.equal(predictions, labels))
+  return mean(is_correct, weights, metrics_collections,
+              updates_collections, name or 'accuracy')
+
+
+def _confusion_matrix_at_thresholds(
+    labels, predictions, thresholds, weights=None, includes=None):
+  """Computes true_positives, false_negatives, true_negatives, false_positives.
+
+  This function creates up to four local variables, `true_positives`,
+  `true_negatives`, `false_positives` and `false_negatives`.
+  `true_positive[i]` is defined as the total weight of values in `predictions`
+  above `thresholds[i]` whose corresponding entry in `labels` is `True`.
+  `false_negatives[i]` is defined as the total weight of values in `predictions`
+  at most `thresholds[i]` whose corresponding entry in `labels` is `True`.
+  `true_negatives[i]` is defined as the total weight of values in `predictions`
+  at most `thresholds[i]` whose corresponding entry in `labels` is `False`.
+  `false_positives[i]` is defined as the total weight of values in `predictions`
+  above `thresholds[i]` whose corresponding entry in `labels` is `False`.
+
+  For estimation of these metrics over a stream of data, for each metric the
+  function respectively creates an `update_op` operation that updates the
+  variable and returns its value.
+
+  If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
+
+  Args:
+    labels: A `Tensor` whose shape matches `predictions`. `labels` will be cast
+      to `bool`.
+    predictions: A floating point `Tensor` of arbitrary shape and whose values
+      are in the range `[0, 1]`.
+    thresholds: A python list or tuple of float thresholds in `[0, 1]`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
+    includes: Tuple of keys to return, from 'tp', 'fn', 'tn', 'fp'. If `None`,
+      default to all four.
+
+  Returns:
+    values: Dict of variables of shape `[len(thresholds)]`. Keys are from
+      `includes`.
+    update_ops: Dict of operations that increments the `values`. Keys are from
+      `includes`.
+
+  Raises:
+    ValueError: If `predictions` and `labels` have mismatched shapes, or if
+      `weights` is not `None` and its shape doesn't match `predictions`, or if
+      `includes` contains invalid keys.
+  """
+  all_includes = ('tp', 'fn', 'tn', 'fp')
+  if includes is None:
+    includes = all_includes
+  else:
+    for include in includes:
+      if include not in all_includes:
+        raise ValueError('Invalid key: %s.' % include)
+
+  labels, predictions, weights = _remove_squeezable_dimensions(
+      labels, predictions, weights)
+  predictions.get_shape().assert_is_compatible_with(labels.get_shape())
+
+  num_thresholds = len(thresholds)
+
+  # Reshape predictions and labels.
+  predictions_2d = array_ops.reshape(predictions, [-1, 1])
+  labels_2d = array_ops.reshape(
+      math_ops.cast(labels, dtype=dtypes.bool), [1, -1])
+
+  # Use static shape if known.
+  num_predictions = predictions_2d.get_shape().as_list()[0]
+
+  # Otherwise use dynamic shape.
+  if num_predictions is None:
+    num_predictions = array_ops.shape(predictions_2d)[0]
+  thresh_tiled = array_ops.tile(
+      array_ops.expand_dims(array_ops.constant(thresholds), [1]),
+      array_ops.pack([1, num_predictions]))
+
+  # Tile the predictions after thresholding them across different thresholds.
+  pred_is_pos = math_ops.greater(
+      array_ops.tile(array_ops.transpose(predictions_2d), [num_thresholds, 1]),
+      thresh_tiled)
+  if ('fn' in includes) or ('tn' in includes):
+    pred_is_neg = math_ops.logical_not(pred_is_pos)
+
+  # Tile labels by number of thresholds
+  label_is_pos = array_ops.tile(labels_2d, [num_thresholds, 1])
+  if ('fp' in includes) or ('tn' in includes):
+    label_is_neg = math_ops.logical_not(label_is_pos)
+
+  if weights is not None:
+    weights = math_ops.to_float(weights)
+    weights_tiled = array_ops.tile(array_ops.reshape(_broadcast_weights(
+        weights, predictions), [1, -1]), [num_thresholds, 1])
+    thresh_tiled.get_shape().assert_is_compatible_with(
+        weights_tiled.get_shape())
+  else:
+    weights_tiled = None
+
+  values = {}
+  update_ops = {}
+
+  if 'tp' in includes:
+    true_p = _create_local('true_positives', shape=[num_thresholds])
+    is_true_positive = math_ops.to_float(
+        math_ops.logical_and(label_is_pos, pred_is_pos))
+    if weights_tiled is not None:
+      is_true_positive *= weights_tiled
+    update_ops['tp'] = state_ops.assign_add(
+        true_p, math_ops.reduce_sum(is_true_positive, 1))
+    values['tp'] = true_p
+
+  if 'fn' in includes:
+    false_n = _create_local('false_negatives', shape=[num_thresholds])
+    is_false_negative = math_ops.to_float(
+        math_ops.logical_and(label_is_pos, pred_is_neg))
+    if weights_tiled is not None:
+      is_false_negative *= weights_tiled
+    update_ops['fn'] = state_ops.assign_add(
+        false_n, math_ops.reduce_sum(is_false_negative, 1))
+    values['fn'] = false_n
+
+  if 'tn' in includes:
+    true_n = _create_local('true_negatives', shape=[num_thresholds])
+    is_true_negative = math_ops.to_float(
+        math_ops.logical_and(label_is_neg, pred_is_neg))
+    if weights_tiled is not None:
+      is_true_negative *= weights_tiled
+    update_ops['tn'] = state_ops.assign_add(
+        true_n, math_ops.reduce_sum(is_true_negative, 1))
+    values['tn'] = true_n
+
+  if 'fp' in includes:
+    false_p = _create_local('false_positives', shape=[num_thresholds])
+    is_false_positive = math_ops.to_float(
+        math_ops.logical_and(label_is_neg, pred_is_pos))
+    if weights_tiled is not None:
+      is_false_positive *= weights_tiled
+    update_ops['fp'] = state_ops.assign_add(
+        false_p, math_ops.reduce_sum(is_false_positive, 1))
+    values['fp'] = false_p
+
+  return values, update_ops
+
+
+def auc(labels, predictions, weights=None, num_thresholds=200,
+        metrics_collections=None, updates_collections=None,
+        curve='ROC', name=None):
+  """Computes the approximate AUC via a Riemann sum.
+
+  The `auc` function creates four local variables, `true_positives`,
+  `true_negatives`, `false_positives` and `false_negatives` that are used to
+  compute the AUC. To discretize the AUC curve, a linearly spaced set of
+  thresholds is used to compute pairs of recall and precision values. The area
+  under the ROC-curve is therefore computed using the height of the recall
+  values by the false positive rate, while the area under the PR-curve is
+  computed using the height of the precision values by the recall.
+
+  This value is ultimately returned as `auc`, an idempotent operation that
+  computes the area under a discretized curve of precision versus recall values
+  (computed using the aforementioned variables).
The `num_thresholds` variable + controls the degree of discretization with larger numbers of thresholds more + closely approximating the true AUC. The quality of the approximation may vary + dramatically depending on `num_thresholds`. + + For best results, `predictions` should be distributed approximately uniformly + in the range [0, 1] and not peaked around 0 or 1. The quality of the AUC + approximation may be poor if this is not the case. + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the `auc`. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + labels: A `bool` `Tensor` whose shape matches `predictions`. + predictions: A floating point `Tensor` of arbitrary shape and whose values + are in the range `[0, 1]`. + weights: An optional `Tensor` whose shape is broadcastable to `predictions`. + num_thresholds: The number of thresholds to use when discretizing the roc + curve. + metrics_collections: An optional list of collections that `auc` should be + added to. + updates_collections: An optional list of collections that `update_op` should + be added to. + curve: Specifies the name of the curve to be computed, 'ROC' [default] or + 'PR' for the Precision-Recall-curve. + name: An optional variable_scope name. + + Returns: + auc: A scalar `Tensor` representing the current area-under-curve. + update_op: An operation that increments the `true_positives`, + `true_negatives`, `false_positives` and `false_negatives` variables + appropriately and whose value matches `auc`. + + Raises: + ValueError: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. + """ + with variable_scope.variable_scope( + name, 'auc', (labels, predictions, weights)): + if curve != 'ROC' and curve != 'PR': + raise ValueError('curve must be either ROC or PR, %s unknown' % + (curve)) + kepsilon = 1e-7 # to account for floating point imprecisions + thresholds = [(i + 1) * 1.0 / (num_thresholds - 1) + for i in range(num_thresholds-2)] + thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon] + + values, update_ops = _confusion_matrix_at_thresholds( + labels, predictions, thresholds, weights) + + # Add epsilons to avoid dividing by 0. + epsilon = 1.0e-6 + def compute_auc(tp, fn, tn, fp, name): + """Computes the roc-auc or pr-auc based on confusion counts.""" + rec = math_ops.div(tp + epsilon, tp + fn + epsilon) + if curve == 'ROC': + fp_rate = math_ops.div(fp, fp + tn + epsilon) + x = fp_rate + y = rec + else: # curve == 'PR'. 
+        prec = math_ops.div(tp + epsilon, tp + fp + epsilon)
+        x = rec
+        y = prec
+      return math_ops.reduce_sum(math_ops.mul(
+          x[:num_thresholds - 1] - x[1:],
+          (y[:num_thresholds - 1] + y[1:]) / 2.), name=name)
+
+    # sum up the areas of all the trapeziums
+    auc_value = compute_auc(
+        values['tp'], values['fn'], values['tn'], values['fp'], 'value')
+    update_op = compute_auc(
+        update_ops['tp'], update_ops['fn'], update_ops['tn'], update_ops['fp'],
+        'update_op')
+
+    if metrics_collections:
+      ops.add_to_collections(metrics_collections, auc_value)
+
+    if updates_collections:
+      ops.add_to_collections(updates_collections, update_op)
+
+    return auc_value, update_op
+
+
+def mean_absolute_error(labels, predictions, weights=None,
+                        metrics_collections=None,
+                        updates_collections=None,
+                        name=None):
+  """Computes the mean absolute error between the labels and predictions.
+
+  The `mean_absolute_error` function creates two local variables,
+  `total` and `count` that are used to compute the mean absolute error. This
+  average is weighted by `weights`, and it is ultimately returned as
+  `mean_absolute_error`: an idempotent operation that simply divides `total` by
+  `count`.
+
+  For estimation of the metric over a stream of data, the function creates an
+  `update_op` operation that updates these variables and returns the
+  `mean_absolute_error`. Internally, an `absolute_errors` operation computes the
+  absolute value of the differences between `predictions` and `labels`. Then
+  `update_op` increments `total` with the reduced sum of the product of
+  `weights` and `absolute_errors`, and it increments `count` with the reduced
+  sum of `weights`.
+
+  If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
+
+  Args:
+    labels: A `Tensor` of the same shape as `predictions`.
+    predictions: A `Tensor` of arbitrary shape.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
+    metrics_collections: An optional list of collections that
+      `mean_absolute_error` should be added to.
+    updates_collections: An optional list of collections that `update_op` should
+      be added to.
+    name: An optional variable_scope name.
+
+  Returns:
+    mean_absolute_error: A `Tensor` representing the current mean, the value of
+      `total` divided by `count`.
+    update_op: An operation that increments the `total` and `count` variables
+      appropriately and whose value matches `mean_absolute_error`.
+
+  Raises:
+    ValueError: If `predictions` and `labels` have mismatched shapes, or if
+      `weights` is not `None` and its shape doesn't match `predictions`, or if
+      either `metrics_collections` or `updates_collections` are not a list or
+      tuple.
+  """
+  labels, predictions, weights = _remove_squeezable_dimensions(
+      labels, predictions, weights)
+  predictions.get_shape().assert_is_compatible_with(labels.get_shape())
+  absolute_errors = math_ops.abs(predictions - labels)
+  return mean(absolute_errors, weights, metrics_collections,
+              updates_collections, name or 'mean_absolute_error')
+
+
+def mean_cosine_distance(labels, predictions, dim, weights=None,
+                         metrics_collections=None,
+                         updates_collections=None,
+                         name=None):
+  """Computes the cosine distance between the labels and predictions.
+
+  The `mean_cosine_distance` function creates two local variables,
+  `total` and `count` that are used to compute the average cosine distance
+  between `predictions` and `labels`.
This average is weighted by `weights`, + and it is ultimately returned as `mean_distance`, which is an idempotent + operation that simply divides `total` by `count`. + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the + `mean_distance`. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + labels: A `Tensor` of arbitrary shape. + predictions: A `Tensor` of the same shape as `labels`. + dim: The dimension along which the cosine distance is computed. + weights: An optional `Tensor` whose shape is broadcastable to `predictions`, + and whose dimension `dim` is 1. + metrics_collections: An optional list of collections that the metric + value variable should be added to. + updates_collections: An optional list of collections that the metric update + ops should be added to. + name: An optional variable_scope name. + + Returns: + mean_distance: A `Tensor` representing the current mean, the value of + `total` divided by `count`. + update_op: An operation that increments the `total` and `count` variables + appropriately. + + Raises: + ValueError: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. + """ + labels, predictions, weights = _remove_squeezable_dimensions( + labels, predictions, weights) + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + radial_diffs = math_ops.mul(predictions, labels) + radial_diffs = math_ops.reduce_sum(radial_diffs, + reduction_indices=[dim,], + keep_dims=True) + mean_distance, update_op = mean(radial_diffs, weights, + None, + None, + name or 'mean_cosine_distance') + mean_distance = math_ops.sub(1.0, mean_distance) + update_op = math_ops.sub(1.0, update_op) + + if metrics_collections: + ops.add_to_collections(metrics_collections, mean_distance) + + if updates_collections: + ops.add_to_collections(updates_collections, update_op) + + return mean_distance, update_op + + +def mean_iou(labels, + predictions, + num_classes, + weights=None, + metrics_collections=None, + updates_collections=None, + name=None): + """Calculate per-step mean Intersection-Over-Union (mIOU). + + Mean Intersection-Over-Union is a common evaluation metric for + semantic image segmentation, which first computes the IOU for each + semantic class and then computes the average over classes. + IOU is defined as follows: + IOU = true_positive / (true_positive + false_positive + false_negative). + The predictions are accumulated in a confusion matrix, weighted by `weights`, + and mIOU is then calculated from it. + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the `mean_iou`. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + labels: A `Tensor` of ground truth labels with shape [batch size] and of + type `int32` or `int64`. The tensor will be flattened, if its rank > 1. + predictions: A `Tensor` of prediction results for semantic labels, whose + shape is [batch size] and type `int32` or `int64`. The tensor will be + flattened, if its rank > 1. + num_classes: The possible number of labels the prediction task can + have. This value must be provided, since a confusion matrix of + dimension = [num_classes, num_classes] will be allocated. 
+ weights: An optional `Tensor` whose shape is broadcastable to `predictions`. + metrics_collections: An optional list of collections that `mean_iou` + should be added to. + updates_collections: An optional list of collections `update_op` should be + added to. + name: An optional variable_scope name. + + Returns: + mean_iou: A `Tensor` representing the mean intersection-over-union. + update_op: An operation that increments the confusion matrix. + + Raises: + ValueError: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. + """ + with variable_scope.variable_scope( + name, 'mean_iou', (predictions, labels, weights)): + # Check if shape is compatible. + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + + # Local variable to accumulate the predictions in the confusion matrix. + cm_dtype = dtypes.int64 if weights is not None else dtypes.float64 + total_cm = _create_local('total_confusion_matrix', + shape=[num_classes, num_classes], dtype=cm_dtype) + + # Cast the type to int64 required by confusion_matrix_ops. + predictions = math_ops.to_int64(predictions) + labels = math_ops.to_int64(labels) + num_classes = math_ops.to_int64(num_classes) + + # Flatten the input if its rank > 1. + predictions_rank = predictions.get_shape().ndims + if predictions_rank > 1: + predictions = array_ops.reshape(predictions, [-1]) + + labels_rank = labels.get_shape().ndims + if labels_rank > 1: + labels = array_ops.reshape(labels, [-1]) + + if weights is not None: + weights_rank = weights.get_shape().ndims + if weights_rank > 1: + weights = array_ops.reshape(weights, [-1]) + + # Accumulate the prediction to current confusion matrix. + current_cm = confusion_matrix.confusion_matrix( + labels, predictions, num_classes, weights=weights, dtype=cm_dtype) + update_op = state_ops.assign_add(total_cm, current_cm) + + def compute_mean_iou(name): + """Compute the mean intersection-over-union via the confusion matrix.""" + sum_over_row = math_ops.to_float(math_ops.reduce_sum(total_cm, 0)) + sum_over_col = math_ops.to_float(math_ops.reduce_sum(total_cm, 1)) + cm_diag = math_ops.to_float(array_ops.diag_part(total_cm)) + denominator = sum_over_row + sum_over_col - cm_diag + + # If the value of the denominator is 0, set it to 1 to avoid + # zero division. + denominator = math_ops.select( + math_ops.greater(denominator, 0), + denominator, + array_ops.ones_like(denominator)) + iou = math_ops.div(cm_diag, denominator) + return math_ops.reduce_mean(iou, name=name) + + mean_iou_v = compute_mean_iou('mean_iou') + + if metrics_collections: + ops.add_to_collections(metrics_collections, mean_iou_v) + + if updates_collections: + ops.add_to_collections(updates_collections, update_op) + + return mean_iou_v, update_op + + +def mean_relative_error(labels, predictions, normalizer, weights=None, + metrics_collections=None, + updates_collections=None, + name=None): + """Computes the mean relative error by normalizing with the given values. + + The `mean_relative_error` function creates two local variables, + `total` and `count` that are used to compute the mean relative absolute error. + This average is weighted by `weights`, and it is ultimately returned as + `mean_relative_error`: an idempotent operation that simply divides `total` by + `count`. 
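+
+  For example, a minimal sketch with illustrative values (the
+  `tf.metrics.mean_relative_error` entry point is assumed):
+
+  ```python
+  labels = tf.constant([1., 2., 4.])
+  predictions = tf.constant([1.5, 2., 3.])
+  # Relative errors are [0.5, 0., 0.25], so after one update the unweighted
+  # metric evaluates to 0.25.
+  error, update_op = tf.metrics.mean_relative_error(
+      labels, predictions, normalizer=labels)
+  ```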
+
+  For estimation of the metric over a stream of data, the function creates an
+  `update_op` operation that updates these variables and returns the
+  `mean_relative_error`. Internally, a `relative_errors` operation divides the
+  absolute value of the differences between `predictions` and `labels` by the
+  `normalizer`. Then `update_op` increments `total` with the reduced sum of the
+  product of `weights` and `relative_errors`, and it increments `count` with the
+  reduced sum of `weights`.
+
+  If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
+
+  Args:
+    labels: A `Tensor` of the same shape as `predictions`.
+    predictions: A `Tensor` of arbitrary shape.
+    normalizer: A `Tensor` of the same shape as `predictions`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
+    metrics_collections: An optional list of collections that
+      `mean_relative_error` should be added to.
+    updates_collections: An optional list of collections that `update_op` should
+      be added to.
+    name: An optional variable_scope name.
+
+  Returns:
+    mean_relative_error: A `Tensor` representing the current mean, the value of
+      `total` divided by `count`.
+    update_op: An operation that increments the `total` and `count` variables
+      appropriately and whose value matches `mean_relative_error`.
+
+  Raises:
+    ValueError: If `predictions` and `labels` have mismatched shapes, or if
+      `weights` is not `None` and its shape doesn't match `predictions`, or if
+      either `metrics_collections` or `updates_collections` are not a list or
+      tuple.
+  """
+  labels, predictions, weights = _remove_squeezable_dimensions(
+      labels, predictions, weights)
+  predictions.get_shape().assert_is_compatible_with(labels.get_shape())
+
+  predictions, normalizer = confusion_matrix.remove_squeezable_dimensions(
+      predictions, normalizer)
+  predictions.get_shape().assert_is_compatible_with(normalizer.get_shape())
+  relative_errors = math_ops.select(
+      math_ops.equal(normalizer, 0.0),
+      array_ops.zeros_like(labels),
+      math_ops.div(math_ops.abs(labels - predictions), normalizer))
+  return mean(relative_errors, weights, metrics_collections,
+              updates_collections, name or 'mean_relative_error')
+
+
+def mean_squared_error(labels, predictions, weights=None,
+                       metrics_collections=None,
+                       updates_collections=None,
+                       name=None):
+  """Computes the mean squared error between the labels and predictions.
+
+  The `mean_squared_error` function creates two local variables,
+  `total` and `count` that are used to compute the mean squared error.
+  This average is weighted by `weights`, and it is ultimately returned as
+  `mean_squared_error`: an idempotent operation that simply divides `total` by
+  `count`.
+
+  For estimation of the metric over a stream of data, the function creates an
+  `update_op` operation that updates these variables and returns the
+  `mean_squared_error`. Internally, a `squared_error` operation computes the
+  element-wise square of the difference between `predictions` and `labels`. Then
+  `update_op` increments `total` with the reduced sum of the product of
+  `weights` and `squared_error`, and it increments `count` with the reduced sum
+  of `weights`.
+
+  If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
+
+  Args:
+    labels: A `Tensor` of the same shape as `predictions`.
+    predictions: A `Tensor` of arbitrary shape.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
+ metrics_collections: An optional list of collections that + `mean_squared_error` should be added to. + updates_collections: An optional list of collections that `update_op` should + be added to. + name: An optional variable_scope name. + + Returns: + mean_squared_error: A `Tensor` representing the current mean, the value of + `total` divided by `count`. + update_op: An operation that increments the `total` and `count` variables + appropriately and whose value matches `mean_squared_error`. + + Raises: + ValueError: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. + """ + labels, predictions, weights = _remove_squeezable_dimensions( + labels, predictions, weights) + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + squared_error = math_ops.square(labels - predictions) + return mean(squared_error, weights, metrics_collections, + updates_collections, name or 'mean_squared_error') + + +def mean_tensor(values, weights=None, metrics_collections=None, + updates_collections=None, name=None): + """Computes the element-wise (weighted) mean of the given tensors. + + In contrast to the `mean` function which returns a scalar with the + mean, this function returns an average tensor with the same shape as the + input tensors. + + The `mean_tensor` function creates two local variables, + `total_tensor` and `count_tensor` that are used to compute the average of + `values`. This average is ultimately returned as `mean` which is an idempotent + operation that simply divides `total` by `count`. + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the `mean`. + `update_op` increments `total` with the reduced sum of the product of `values` + and `weights`, and it increments `count` with the reduced sum of `weights`. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + values: A `Tensor` of arbitrary dimensions. + weights: An optional `Tensor` whose shape is broadcastable to `values`. + metrics_collections: An optional list of collections that `mean` + should be added to. + updates_collections: An optional list of collections that `update_op` + should be added to. + name: An optional variable_scope name. + + Returns: + mean: A float `Tensor` representing the current mean, the value of `total` + divided by `count`. + update_op: An operation that increments the `total` and `count` variables + appropriately and whose value matches `mean_value`. + + Raises: + ValueError: If `weights` is not `None` and its shape doesn't match `values`, + or if either `metrics_collections` or `updates_collections` are not a list + or tuple. 
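+
+  For example, a minimal streaming sketch (illustrative values; assumes the
+  `tf.metrics.mean_tensor` entry point and a session-based workflow with
+  local-variable initialization):
+
+  ```python
+  values = tf.constant([[1., 2.], [3., 4.]])
+  mean_t, update_op = tf.metrics.mean_tensor(values)
+  with tf.Session() as sess:
+    sess.run(tf.local_variables_initializer())
+    sess.run(update_op)
+    print(sess.run(mean_t))  # ==> [[1. 2.] [3. 4.]] after a single batch
+  ```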
+ """ + with variable_scope.variable_scope(name, 'mean', (values, weights)): + total = _create_local('total_tensor', shape=values.get_shape()) + count = _create_local('count_tensor', shape=values.get_shape()) + + num_values = array_ops.ones_like(values) + if weights is not None: + weights = math_ops.to_float(weights) + values = math_ops.mul(values, weights) + num_values = math_ops.mul(num_values, weights) + + total_compute_op = state_ops.assign_add(total, values) + count_compute_op = state_ops.assign_add(count, num_values) + + def compute_mean(total, count, name): + non_zero_count = math_ops.maximum(count, + array_ops.ones_like(count), + name=name) + return math_ops.truediv(total, non_zero_count, name=name) + + mean_t = compute_mean(total, count, 'value') + with ops.control_dependencies([total_compute_op, count_compute_op]): + update_op = compute_mean(total, count, 'update_op') + + if metrics_collections: + ops.add_to_collections(metrics_collections, mean_t) + + if updates_collections: + ops.add_to_collections(updates_collections, update_op) + + return mean_t, update_op + + +def percentage_below(values, threshold, weights=None, + metrics_collections=None, + updates_collections=None, + name=None): + """Computes the percentage of values less than the given threshold. + + The `percentage_below` function creates two local variables, + `total` and `count` that are used to compute the percentage of `values` that + fall below `threshold`. This rate is weighted by `weights`, and it is + ultimately returned as `percentage` which is an idempotent operation that + simply divides `total` by `count`. + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the + `percentage`. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + values: A numeric `Tensor` of arbitrary size. + threshold: A scalar threshold. + weights: An optional `Tensor` whose shape is broadcastable to `values`. + metrics_collections: An optional list of collections that the metric + value variable should be added to. + updates_collections: An optional list of collections that the metric update + ops should be added to. + name: An optional variable_scope name. + + Returns: + percentage: A `Tensor` representing the current mean, the value of `total` + divided by `count`. + update_op: An operation that increments the `total` and `count` variables + appropriately. + + Raises: + ValueError: If `weights` is not `None` and its shape doesn't match `values`, + or if either `metrics_collections` or `updates_collections` are not a list + or tuple. + """ + is_below_threshold = math_ops.to_float(math_ops.less(values, threshold)) + return mean(is_below_threshold, + weights, + metrics_collections, + updates_collections, + name or 'percentage_below_threshold') + + +def _count_condition(values, weights=None, metrics_collections=None, + updates_collections=None): + """Sums the weights of cases where the given values are True. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + values: A `bool` `Tensor` of arbitrary size. + weights: An optional `Tensor` whose shape is broadcastable to `values`. + metrics_collections: An optional list of collections that the metric + value variable should be added to. + updates_collections: An optional list of collections that the metric update + ops should be added to. 
+ + Returns: + value_tensor: A `Tensor` representing the current value of the metric. + update_op: An operation that accumulates the error from a batch of data. + + Raises: + ValueError: If `weights` is not `None` and its shape doesn't match `values`, + or if either `metrics_collections` or `updates_collections` are not a list + or tuple. + """ + check_ops.assert_type(values, dtypes.bool) + count = _create_local('count', shape=[]) + + values = math_ops.to_float(values) + if weights is not None: + weights = math_ops.to_float(weights) + values = math_ops.mul(values, weights) + + value_tensor = array_ops.identity(count) + update_op = state_ops.assign_add(count, math_ops.reduce_sum(values)) + + if metrics_collections: + ops.add_to_collections(metrics_collections, value_tensor) + + if updates_collections: + ops.add_to_collections(updates_collections, update_op) + + return value_tensor, update_op + + +def true_positives(labels, predictions, weights=None, + metrics_collections=None, + updates_collections=None, + name=None): + """Sum the weights of true_positives. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + labels: The ground truth values, a `bool` `Tensor` whose dimensions must + match `predictions`. + predictions: The predicted values, a `bool` `Tensor` of arbitrary + dimensions. + weights: An optional `Tensor` whose shape is broadcastable to `predictions`. + metrics_collections: An optional list of collections that the metric + value variable should be added to. + updates_collections: An optional list of collections that the metric update + ops should be added to. + name: An optional variable_scope name. + + Returns: + value_tensor: A `Tensor` representing the current value of the metric. + update_op: An operation that accumulates the error from a batch of data. + + Raises: + ValueError: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. + """ + with variable_scope.variable_scope( + name, 'true_positives', (predictions, labels, weights)): + + predictions = ops.convert_to_tensor(predictions) + labels = ops.convert_to_tensor(labels) + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + is_true_positive = math_ops.logical_and(math_ops.equal(labels, 1), + math_ops.equal(predictions, 1)) + return _count_condition(is_true_positive, weights, metrics_collections, + updates_collections) + + +def false_positives(labels, predictions, weights=None, + metrics_collections=None, + updates_collections=None, + name=None): + """Sum the weights of false positives. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + labels: The ground truth values, a `bool` `Tensor` whose dimensions must + match `predictions`. + predictions: The predicted values, a `bool` `Tensor` of arbitrary + dimensions. + weights: An optional `Tensor` whose shape is broadcastable to `predictions`. + metrics_collections: An optional list of collections that the metric + value variable should be added to. + updates_collections: An optional list of collections that the metric update + ops should be added to. + name: An optional variable_scope name. + + Returns: + value_tensor: A `Tensor` representing the current value of the metric. + update_op: An operation that accumulates the error from a batch of data. 
+ + Raises: + ValueError: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. + """ + with variable_scope.variable_scope( + name, 'false_positives', (predictions, labels, weights)): + + predictions = ops.convert_to_tensor(predictions) + labels = ops.convert_to_tensor(labels) + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + is_false_positive = math_ops.logical_and(math_ops.equal(labels, 0), + math_ops.equal(predictions, 1)) + return _count_condition(is_false_positive, weights, metrics_collections, + updates_collections) + + +def precision(labels, predictions, weights=None, + metrics_collections=None, updates_collections=None, + name=None): + """Computes the precision of the predictions with respect to the labels. + + The `precision` function creates two local variables, + `true_positives` and `false_positives`, that are used to compute the + precision. This value is ultimately returned as `precision`, an idempotent + operation that simply divides `true_positives` by the sum of `true_positives` + and `false_positives`. + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the + `precision`. `update_op` weights each prediction by the corresponding value in + `weights`. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + labels: The ground truth values, a `bool` `Tensor` whose dimensions must + match `predictions`. + predictions: The predicted values, a `bool` `Tensor` of arbitrary shape. + weights: An optional `Tensor` whose shape is broadcastable to `predictions`. + metrics_collections: An optional list of collections that `precision` should + be added to. + updates_collections: An optional list of collections that `update_op` should + be added to. + name: An optional variable_scope name. + + Returns: + precision: Scalar float `Tensor` with the value of `true_positives` + divided by the sum of `true_positives` and `false_positives`. + update_op: `Operation` that increments `true_positives` and + `false_positives` variables appropriately and whose value matches + `precision`. + + Raises: + ValueError: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. 
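+
+  For example, a minimal streaming sketch (illustrative 0/1 values standing in
+  for `bool` labels and predictions; the `tf.metrics.precision` entry point and
+  a session-based workflow are assumed):
+
+  ```python
+  labels = tf.constant([1, 0, 1, 1])
+  predictions = tf.constant([1, 1, 0, 1])
+  prec, update_op = tf.metrics.precision(labels, predictions)
+  with tf.Session() as sess:
+    sess.run(tf.local_variables_initializer())
+    sess.run(update_op)
+    print(sess.run(prec))  # ==> 0.6666667, i.e. 2 TP / (2 TP + 1 FP)
+  ```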
+ """ + with variable_scope.variable_scope( + name, 'precision', (predictions, labels, weights)): + + labels, predictions, weights = _remove_squeezable_dimensions( + labels, predictions, weights) + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + + true_p, true_positives_update_op = true_positives( + labels, predictions, weights, metrics_collections=None, + updates_collections=None, name=None) + false_p, false_positives_update_op = false_positives( + labels, predictions, weights, metrics_collections=None, + updates_collections=None, name=None) + + def compute_precision(name): + return math_ops.select( + math_ops.greater(true_p + false_p, 0), + math_ops.div(true_p, true_p + false_p), + 0, + name) + + p = compute_precision('value') + with ops.control_dependencies([true_positives_update_op, + false_positives_update_op]): + update_op = compute_precision('update_op') + + if metrics_collections: + ops.add_to_collections(metrics_collections, p) + + if updates_collections: + ops.add_to_collections(updates_collections, update_op) + + return p, update_op + + +def precision_at_thresholds(labels, predictions, thresholds, + weights=None, + metrics_collections=None, + updates_collections=None, name=None): + """Computes precision values for different `thresholds` on `predictions`. + + The `precision_at_thresholds` function creates four local variables, + `true_positives`, `true_negatives`, `false_positives` and `false_negatives` + for various values of thresholds. `precision[i]` is defined as the total + weight of values in `predictions` above `thresholds[i]` whose corresponding + entry in `labels` is `True`, divided by the total weight of values in + `predictions` above `thresholds[i]` (`true_positives[i] / (true_positives[i] + + false_positives[i])`). + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the + `precision`. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + labels: A `bool` `Tensor` whose shape matches `predictions`. + predictions: A floating point `Tensor` of arbitrary shape and whose values + are in the range `[0, 1]`. + thresholds: A python list or tuple of float thresholds in `[0, 1]`. + weights: An optional `Tensor` whose shape is broadcastable to `predictions`. + metrics_collections: An optional list of collections that `auc` should be + added to. + updates_collections: An optional list of collections that `update_op` should + be added to. + name: An optional variable_scope name. + + Returns: + precision: A float `Tensor` of shape `[len(thresholds)]`. + update_op: An operation that increments the `true_positives`, + `true_negatives`, `false_positives` and `false_negatives` variables that + are used in the computation of `precision`. + + Raises: + ValueError: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. + """ + with variable_scope.variable_scope(name, 'precision_at_thresholds', + (predictions, labels, weights)): + values, update_ops = _confusion_matrix_at_thresholds( + labels, predictions, thresholds, weights, includes=('tp', 'fp')) + tp = values['tp'] + fp = values['fp'] + + # Avoid division by zero. 
+    epsilon = 1e-7
+    def compute_precision(name):
+      return math_ops.div(tp, epsilon + tp + fp, name='precision_' + name)
+
+    prec = compute_precision('value')
+    with ops.control_dependencies(update_ops.values()):
+      update_op = compute_precision('update_op')
+
+    if metrics_collections:
+      ops.add_to_collections(metrics_collections, prec)
+
+    if updates_collections:
+      ops.add_to_collections(updates_collections, update_op)
+
+    return prec, update_op
+
+
+def false_negatives(labels, predictions, weights=None,
+                    metrics_collections=None,
+                    updates_collections=None,
+                    name=None):
+  """Computes the total number of false negatives.
+
+  If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
+
+  Args:
+    labels: The ground truth values, a `bool` `Tensor` whose dimensions must
+      match `predictions`.
+    predictions: The predicted values, a `bool` `Tensor` of arbitrary
+      dimensions.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
+    metrics_collections: An optional list of collections that the metric
+      value variable should be added to.
+    updates_collections: An optional list of collections that the metric update
+      ops should be added to.
+    name: An optional variable_scope name.
+
+  Returns:
+    value_tensor: A `Tensor` representing the current value of the metric.
+    update_op: An operation that accumulates the error from a batch of data.
+
+  Raises:
+    ValueError: If `weights` is not `None` and its shape doesn't match `values`,
+      or if either `metrics_collections` or `updates_collections` are not a list
+      or tuple.
+  """
+  with variable_scope.variable_scope(
+      name, 'false_negatives', (predictions, labels, weights)):
+
+    predictions = ops.convert_to_tensor(predictions)
+    labels = ops.convert_to_tensor(labels)
+    predictions.get_shape().assert_is_compatible_with(labels.get_shape())
+    is_false_negative = math_ops.logical_and(math_ops.equal(labels, 1),
+                                             math_ops.equal(predictions, 0))
+    return _count_condition(is_false_negative, weights, metrics_collections,
+                            updates_collections)
+
+
+def recall(labels, predictions, weights=None,
+           metrics_collections=None, updates_collections=None,
+           name=None):
+  """Computes the recall of the predictions with respect to the labels.
+
+  The `recall` function creates two local variables, `true_positives`
+  and `false_negatives`, that are used to compute the recall. This value is
+  ultimately returned as `recall`, an idempotent operation that simply divides
+  `true_positives` by the sum of `true_positives` and `false_negatives`.
+
+  For estimation of the metric over a stream of data, the function creates an
+  `update_op` that updates these variables and returns the `recall`. `update_op`
+  weights each prediction by the corresponding value in `weights`.
+
+  If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
+
+  Args:
+    labels: The ground truth values, a `bool` `Tensor` whose dimensions must
+      match `predictions`.
+    predictions: The predicted values, a `bool` `Tensor` of arbitrary shape.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
+    metrics_collections: An optional list of collections that `recall` should
+      be added to.
+    updates_collections: An optional list of collections that `update_op` should
+      be added to.
+    name: An optional variable_scope name.
+
+  Returns:
+    recall: Scalar float `Tensor` with the value of `true_positives` divided
+      by the sum of `true_positives` and `false_negatives`.
+ update_op: `Operation` that increments `true_positives` and + `false_negatives` variables appropriately and whose value matches + `recall`. + + Raises: + ValueError: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. + """ + with variable_scope.variable_scope( + name, 'recall', (predictions, labels, weights)): + labels, predictions, weights = _remove_squeezable_dimensions( + labels, predictions, weights) + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + + true_p, true_positives_update_op = true_positives( + labels, predictions, weights, metrics_collections=None, + updates_collections=None, name=None) + false_n, false_negatives_update_op = false_negatives( + labels, predictions, weights, metrics_collections=None, + updates_collections=None, name=None) + + def compute_recall(true_p, false_n, name): + return math_ops.select( + math_ops.greater(true_p + false_n, 0), + math_ops.div(true_p, true_p + false_n), + 0, + name) + + rec = compute_recall(true_p, false_n, 'value') + with ops.control_dependencies([true_positives_update_op, + false_negatives_update_op]): + update_op = compute_recall(true_p, false_n, 'update_op') + + if metrics_collections: + ops.add_to_collections(metrics_collections, rec) + + if updates_collections: + ops.add_to_collections(updates_collections, update_op) + + return rec, update_op + + +def _at_k_name(name, k=None, class_id=None): + if k is not None: + name = '%s_at_%d' % (name, k) + else: + name = '%s_at_k' % (name) + if class_id is not None: + name = '%s_class%d' % (name, class_id) + return name + + +def _select_class_id(ids, selected_id): + """Filter all but `selected_id` out of `ids`. + + Args: + ids: `int64` `Tensor` or `SparseTensor` of IDs. + selected_id: Int id to select. + + Returns: + `SparseTensor` of same dimensions as `ids`. This contains only the entries + equal to `selected_id`. + """ + if isinstance( + ids, (sparse_tensor.SparseTensor, sparse_tensor.SparseTensorValue)): + return sparse_ops.sparse_retain( + ids, math_ops.equal(ids.values, selected_id)) + + # TODO(ptucker): Make this more efficient, maybe add a sparse version of + # tf.equal and tf.reduce_any? + + # Shape of filled IDs is the same as `ids` with the last dim collapsed to 1. + ids_shape = array_ops.shape(ids, out_type=dtypes.int64) + ids_last_dim = array_ops.size(ids_shape) - 1 + filled_selected_id_shape = math_ops.reduced_shape( + ids_shape, array_ops.reshape(ids_last_dim, [1])) + + # Intersect `ids` with the selected ID. + filled_selected_id = array_ops.fill( + filled_selected_id_shape, math_ops.to_int64(selected_id)) + result = sets.set_intersection(filled_selected_id, ids) + return sparse_tensor.SparseTensor( + indices=result.indices, values=result.values, shape=ids_shape) + + +def _maybe_select_class_id(labels, predictions_idx, selected_id=None): + """If class ID is specified, filter all other classes. + + Args: + labels: `int64` `Tensor` or `SparseTensor` with shape + [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of + target classes for the associated prediction. Commonly, N=1 and `labels` + has shape [batch_size, num_labels]. [D1, ... DN] must match + `predictions_idx`. + predictions_idx: `int64` `Tensor` of class IDs, with shape [D1, ... DN, k] + where N >= 1. Commonly, N=1 and `predictions_idx` has shape + [batch size, k]. + selected_id: Int id to select. 
+
+  Returns:
+    Tuple of `labels` and `predictions_idx`, possibly with classes removed.
+  """
+  if selected_id is None:
+    return labels, predictions_idx
+  return (_select_class_id(labels, selected_id),
+          _select_class_id(predictions_idx, selected_id))
+
+
+def _sparse_true_positive_at_k(labels,
+                               predictions_idx,
+                               class_id=None,
+                               weights=None,
+                               name=None):
+  """Calculates true positives for recall@k and precision@k.
+
+  If `class_id` is specified, calculate binary true positives for `class_id`
+  only.
+  If `class_id` is not specified, calculate metrics for `k` predicted vs
+  `n` label classes, where `n` is the 2nd dimension of `labels`.
+
+  Args:
+    labels: `int64` `Tensor` or `SparseTensor` with shape
+      [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of
+      target classes for the associated prediction. Commonly, N=1 and `labels`
+      has shape [batch_size, num_labels]. [D1, ... DN] must match
+      `predictions_idx`.
+    predictions_idx: 1-D or higher `int64` `Tensor` with last dimension `k`,
+      top `k` predicted classes. For rank `n`, the first `n-1` dimensions must
+      match `labels`.
+    class_id: Class for which we want binary metrics.
+    weights: `Tensor` whose shape is broadcastable to the first [D1, ... DN]
+      dimensions of `predictions_idx` and `labels`.
+    name: Name of operation.
+
+  Returns:
+    A [D1, ... DN] `Tensor` of true positive counts.
+  """
+  with ops.name_scope(name, 'true_positives', (predictions_idx, labels)):
+    labels, predictions_idx = _maybe_select_class_id(
+        labels, predictions_idx, class_id)
+    tp = sets.set_size(sets.set_intersection(predictions_idx, labels))
+    tp = math_ops.to_double(tp)
+    if weights is not None:
+      weights = math_ops.to_double(weights)
+      tp = math_ops.mul(tp, weights)
+    return tp
+
+
+def _streaming_sparse_true_positive_at_k(labels,
+                                         predictions_idx,
+                                         k=None,
+                                         class_id=None,
+                                         weights=None,
+                                         name=None):
+  """Calculates weighted per step true positives for recall@k and precision@k.
+
+  If `class_id` is specified, calculate binary true positives for `class_id`
+  only.
+  If `class_id` is not specified, calculate metrics for `k` predicted vs
+  `n` label classes, where `n` is the 2nd dimension of `labels`.
+
+  If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
+
+  Args:
+    labels: `int64` `Tensor` or `SparseTensor` with shape
+      [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of
+      target classes for the associated prediction. Commonly, N=1 and `labels`
+      has shape [batch_size, num_labels]. [D1, ... DN] must match
+      `predictions_idx`.
+    predictions_idx: 1-D or higher `int64` `Tensor` with last dimension `k`,
+      top `k` predicted classes. For rank `n`, the first `n-1` dimensions must
+      match `labels`.
+    k: Integer, k for @k metric. This is only used for default op name.
+    class_id: Class for which we want binary metrics.
+    weights: `Tensor` whose shape is broadcastable to the first [D1, ... DN]
+      dimensions of `predictions_idx` and `labels`.
+    name: Name of new variable, and namespace for other dependent ops.
+
+  Returns:
+    A tuple of `Variable` and update `Operation`.
+
+  Raises:
+    ValueError: If `weights` is not `None` and has an incompatible shape.
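+
+  For example, a minimal sketch with illustrative dense `int64` inputs:
+
+  ```python
+  labels = tf.constant([[0, 1]], dtype=tf.int64)            # label classes
+  predictions_idx = tf.constant([[1, 2]], dtype=tf.int64)   # top-2 predictions
+  # The row-wise set intersection has size 1, so the update op adds 1.0.
+  var, update = _streaming_sparse_true_positive_at_k(
+      labels=labels, predictions_idx=predictions_idx, k=2)
+  ```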
+ """ + default_name = _at_k_name('true_positive', k, class_id=class_id) + with ops.name_scope(name, default_name, (predictions_idx, labels)) as scope: + tp = _sparse_true_positive_at_k( + predictions_idx=predictions_idx, labels=labels, class_id=class_id, + weights=weights) + batch_total_tp = math_ops.to_double(math_ops.reduce_sum(tp)) + + var = _local_variable(array_ops.zeros([], dtype=dtypes.float64), name=scope) + return var, state_ops.assign_add(var, batch_total_tp, name='update') + + +def _sparse_false_negative_at_k(labels, + predictions_idx, + class_id=None, + weights=None): + """Calculates false negatives for recall@k. + + If `class_id` is specified, calculate binary true positives for `class_id` + only. + If `class_id` is not specified, calculate metrics for `k` predicted vs + `n` label classes, where `n` is the 2nd dimension of `labels_sparse`. + + Args: + labels: `int64` `Tensor` or `SparseTensor` with shape + [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of + target classes for the associated prediction. Commonly, N=1 and `labels` + has shape [batch_size, num_labels]. [D1, ... DN] must match + `predictions_idx`. + predictions_idx: 1-D or higher `int64` `Tensor` with last dimension `k`, + top `k` predicted classes. For rank `n`, the first `n-1` dimensions must + match `labels`. + class_id: Class for which we want binary metrics. + weights: `Tensor` whose shape is broadcastable to the first [D1, ... DN] + dimensions of `predictions_idx` and `labels`. + + Returns: + A [D1, ... DN] `Tensor` of false negative counts. + """ + with ops.name_scope(None, 'false_negatives', (predictions_idx, labels)): + labels, predictions_idx = _maybe_select_class_id(labels, + predictions_idx, + class_id) + fn = sets.set_size(sets.set_difference(predictions_idx, + labels, + aminusb=False)) + fn = math_ops.to_double(fn) + if weights is not None: + weights = math_ops.to_double(weights) + fn = math_ops.mul(fn, weights) + return fn + + +def _streaming_sparse_false_negative_at_k(labels, + predictions_idx, + k, + class_id=None, + weights=None, + name=None): + """Calculates weighted per step false negatives for recall@k. + + If `class_id` is specified, calculate binary true positives for `class_id` + only. + If `class_id` is not specified, calculate metrics for `k` predicted vs + `n` label classes, where `n` is the 2nd dimension of `labels`. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + labels: `int64` `Tensor` or `SparseTensor` with shape + [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of + target classes for the associated prediction. Commonly, N=1 and `labels` + has shape [batch_size, num_labels]. [D1, ... DN] must match + `predictions_idx`. + predictions_idx: 1-D or higher `int64` `Tensor` with last dimension `k`, + top `k` predicted classes. For rank `n`, the first `n-1` dimensions must + match `labels`. + k: Integer, k for @k metric. This is only used for default op name. + class_id: Class for which we want binary metrics. + weights: `Tensor` whose shape is broadcastable to the first [D1, ... DN] + dimensions of `predictions_idx` and `labels`. + name: Name of new variable, and namespace for other dependent ops. + + Returns: + A tuple of `Variable` and update `Operation`. + + Raises: + ValueError: If `weights` is not `None` and has an incomptable shape. 
+ """ + default_name = _at_k_name('false_negative', k, class_id=class_id) + with ops.name_scope(name, default_name, (predictions_idx, labels)) as scope: + fn = _sparse_false_negative_at_k( + predictions_idx=predictions_idx, labels=labels, class_id=class_id, + weights=weights) + batch_total_fn = math_ops.to_double(math_ops.reduce_sum(fn)) + + var = _local_variable(array_ops.zeros([], dtype=dtypes.float64), name=scope) + return var, state_ops.assign_add(var, batch_total_fn, name='update') + + +def recall_at_k(labels, + predictions, + k, + class_id=None, + weights=None, + metrics_collections=None, + updates_collections=None, + name=None): + """Computes recall@k of the predictions with respect to sparse labels. + + If `class_id` is specified, we calculate recall by considering only the + entries in the batch for which `class_id` is in the label, and computing + the fraction of them for which `class_id` is in the top-k `predictions`. + If `class_id` is not specified, we'll calculate recall as how often on + average a class among the labels of a batch entry is in the top-k + `predictions`. + + `sparse_recall_at_k` creates two local variables, + `true_positive_at_` and `false_negative_at_`, that are used to compute + the recall_at_k frequency. This frequency is ultimately returned as + `recall_at_`: an idempotent operation that simply divides + `true_positive_at_` by total (`true_positive_at_` + + `false_negative_at_`). + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the + `recall_at_`. Internally, a `top_k` operation computes a `Tensor` + indicating the top `k` `predictions`. Set operations applied to `top_k` and + `labels` calculate the true positives and false negatives weighted by + `weights`. Then `update_op` increments `true_positive_at_` and + `false_negative_at_` using these values. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + labels: `int64` `Tensor` or `SparseTensor` with shape + [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of + target classes for the associated prediction. Commonly, N=1 and `labels` + has shape [batch_size, num_labels]. [D1, ... DN] must match `predictions`. + Values should be in range [0, num_classes), where num_classes is the last + dimension of `predictions`. Values outside this range always count + towards `false_negative_at_`. + predictions: Float `Tensor` with shape [D1, ... DN, num_classes] where + N >= 1. Commonly, N=1 and predictions has shape [batch size, num_classes]. + The final dimension contains the logit values for each class. [D1, ... DN] + must match `labels`. + k: Integer, k for @k metric. + class_id: Integer class ID for which we want binary metrics. This should be + in range [0, num_classes), where num_classes is the last dimension of + `predictions`. If class_id is outside this range, the method returns NAN. + weights: An optional `Tensor` whose shape is broadcastable to the first + [D1, ... DN] dimensions of `predictions` and `labels`. + metrics_collections: An optional list of collections that values should + be added to. + updates_collections: An optional list of collections that updates should + be added to. + name: Name of new update operation, and namespace for other dependent ops. + + Returns: + recall: Scalar `float64` `Tensor` with the value of `true_positives` divided + by the sum of `true_positives` and `false_negatives`. 
+ update_op: `Operation` that increments `true_positives` and + `false_negatives` variables appropriately, and whose value matches + `recall`. + + Raises: + ValueError: If `weights` is not `None` and its shape doesn't match + `predictions`, or if either `metrics_collections` or `updates_collections` + are not a list or tuple. + """ + default_name = _at_k_name('recall', k, class_id=class_id) + with ops.name_scope(name, default_name, (predictions, labels)) as scope: + _, top_k_idx = nn.top_k(predictions, k) + top_k_idx = math_ops.to_int64(top_k_idx) + tp, tp_update = _streaming_sparse_true_positive_at_k( + predictions_idx=top_k_idx, labels=labels, k=k, class_id=class_id, + weights=weights) + fn, fn_update = _streaming_sparse_false_negative_at_k( + predictions_idx=top_k_idx, labels=labels, k=k, class_id=class_id, + weights=weights) + + metric = math_ops.div(tp, math_ops.add(tp, fn), name=scope) + update = math_ops.div( + tp_update, math_ops.add(tp_update, fn_update), name='update') + if metrics_collections: + ops.add_to_collections(metrics_collections, metric) + if updates_collections: + ops.add_to_collections(updates_collections, update) + return metric, update + + +def recall_at_thresholds(labels, predictions, thresholds, + weights=None, metrics_collections=None, + updates_collections=None, name=None): + """Computes various recall values for different `thresholds` on `predictions`. + + The `recall_at_thresholds` function creates four local variables, + `true_positives`, `true_negatives`, `false_positives` and `false_negatives` + for various values of thresholds. `recall[i]` is defined as the total weight + of values in `predictions` above `thresholds[i]` whose corresponding entry in + `labels` is `True`, divided by the total weight of `True` values in `labels` + (`true_positives[i] / (true_positives[i] + false_negatives[i])`). + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the `recall`. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + Args: + labels: A `bool` `Tensor` whose shape matches `predictions`. + predictions: A floating point `Tensor` of arbitrary shape and whose values + are in the range `[0, 1]`. + thresholds: A python list or tuple of float thresholds in `[0, 1]`. + weights: An optional `Tensor` whose shape is broadcastable to `predictions`. + metrics_collections: An optional list of collections that `recall` should be + added to. + updates_collections: An optional list of collections that `update_op` should + be added to. + name: An optional variable_scope name. + + Returns: + recall: A float `Tensor` of shape `[len(thresholds)]`. + update_op: An operation that increments the `true_positives`, + `true_negatives`, `false_positives` and `false_negatives` variables that + are used in the computation of `recall`. + + Raises: + ValueError: If `predictions` and `labels` have mismatched shapes, or if + `weights` is not `None` and its shape doesn't match `predictions`, or if + either `metrics_collections` or `updates_collections` are not a list or + tuple. + """ + with variable_scope.variable_scope(name, 'recall_at_thresholds', + (predictions, labels, weights)): + values, update_ops = _confusion_matrix_at_thresholds( + labels, predictions, thresholds, weights, includes=('tp', 'fn')) + tp = values['tp'] + fn = values['fn'] + + # Avoid division by zero. 
+ epsilon = 1e-7
+ def compute_recall(name):
+ return math_ops.div(tp, epsilon + tp + fn, name='recall_' + name)
+
+ rec = compute_recall('value')
+ with ops.control_dependencies(update_ops.values()):
+ update_op = compute_recall('update_op')
+
+ if metrics_collections:
+ ops.add_to_collections(metrics_collections, rec)
+
+ if updates_collections:
+ ops.add_to_collections(updates_collections, update_op)
+
+ return rec, update_op
+
+
+def root_mean_squared_error(labels, predictions, weights=None,
+ metrics_collections=None,
+ updates_collections=None,
+ name=None):
+ """Computes the root mean squared error between the labels and predictions.
+
+ The `root_mean_squared_error` function creates two local variables,
+ `total` and `count` that are used to compute the root mean squared error.
+ This average is weighted by `weights`, and it is ultimately returned as
+ `root_mean_squared_error`: an idempotent operation that takes the square root
+ of the division of `total` by `count`.
+
+ For estimation of the metric over a stream of data, the function creates an
+ `update_op` operation that updates these variables and returns the
+ `root_mean_squared_error`. Internally, a `squared_error` operation computes
+ the element-wise square of the difference between `predictions` and `labels`.
+ Then `update_op` increments `total` with the reduced sum of the product of
+ `weights` and `squared_error`, and it increments `count` with the reduced sum
+ of `weights`.
+
+ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
+
+ Args:
+ labels: A `Tensor` of the same shape as `predictions`.
+ predictions: A `Tensor` of arbitrary shape.
+ weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
+ metrics_collections: An optional list of collections that
+ `root_mean_squared_error` should be added to.
+ updates_collections: An optional list of collections that `update_op` should
+ be added to.
+ name: An optional variable_scope name.
+
+ Returns:
+ root_mean_squared_error: A `Tensor` representing the current root mean
+ squared error, the square root of the value of `total` divided by `count`.
+ update_op: An operation that increments the `total` and `count` variables
+ appropriately and whose value matches `root_mean_squared_error`.
+
+ Raises:
+ ValueError: If `predictions` and `labels` have mismatched shapes, or if
+ `weights` is not `None` and its shape doesn't match `predictions`, or if
+ either `metrics_collections` or `updates_collections` are not a list or
+ tuple.
+ """
+ labels, predictions, weights = _remove_squeezable_dimensions(
+ labels, predictions, weights)
+ predictions.get_shape().assert_is_compatible_with(labels.get_shape())
+ value_tensor, update_op = mean_squared_error(
+ labels, predictions, weights, None, None,
+ name or 'root_mean_squared_error')
+
+ rmse = math_ops.sqrt(value_tensor)
+ with ops.control_dependencies([update_op]):
+ update_op = math_ops.sqrt(update_op)
+
+ if metrics_collections:
+ ops.add_to_collections(metrics_collections, rmse)
+
+ if updates_collections:
+ ops.add_to_collections(updates_collections, update_op)
+
+ return rmse, update_op
+
+
+def sensitivity_at_specificity(
+ labels, predictions, specificity, weights=None, num_thresholds=200,
+ metrics_collections=None, updates_collections=None, name=None):
+ """Computes the sensitivity at a given specificity.
+ + The `sensitivity_at_specificity` function creates four local + variables, `true_positives`, `true_negatives`, `false_positives` and + `false_negatives` that are used to compute the sensitivity at the given + specificity value. The threshold for the given specificity value is computed + and used to evaluate the corresponding sensitivity. + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the + `sensitivity`. `update_op` increments the `true_positives`, `true_negatives`, + `false_positives` and `false_negatives` counts with the weight of each case + found in the `predictions` and `labels`. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + For additional information about specificity and sensitivity, see the + following: https://en.wikipedia.org/wiki/Sensitivity_and_specificity + + Args: + labels: A `bool` `Tensor` whose shape matches `predictions`. + predictions: A floating point `Tensor` of arbitrary shape and whose values + are in the range `[0, 1]`. + specificity: A scalar value in range `[0, 1]`. + weights: An optional `Tensor` whose shape is broadcastable to `predictions`. + num_thresholds: The number of thresholds to use for matching the given + specificity. + metrics_collections: An optional list of collections that `sensitivity` + should be added to. + updates_collections: An optional list of collections that `update_op` should + be added to. + name: An optional variable_scope name. + + Returns: + sensitivity: A scalar `Tensor` representing the sensitivity at the given + `specificity` value. + update_op: An operation that increments the `true_positives`, + `true_negatives`, `false_positives` and `false_negatives` variables + appropriately and whose value matches `sensitivity`. + + Raises: + ValueError: If `predictions` and `labels` have mismatched shapes, if + `weights` is not `None` and its shape doesn't match `predictions`, or if + `specificity` is not between 0 and 1, or if either `metrics_collections` + or `updates_collections` are not a list or tuple. 
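+
+ For example, a minimal usage sketch (the tensors and session below are
+ illustrative assumptions, not part of this API):
+
+ ```python
+ labels = tf.constant([True, False, True, True])
+ predictions = tf.constant([0.8, 0.6, 0.4, 0.9])
+ sensitivity, update_op = sensitivity_at_specificity(
+     labels, predictions, specificity=0.95)
+
+ with tf.Session() as sess:
+   sess.run(tf.local_variables_initializer())  # the metric uses local variables
+   sess.run(update_op)                         # accumulate confusion counts
+   print(sess.run(sensitivity))
+ ```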
+ """ + if specificity < 0 or specificity > 1: + raise ValueError('`specificity` must be in the range [0, 1].') + + with variable_scope.variable_scope(name, 'sensitivity_at_specificity', + (predictions, labels, weights)): + kepsilon = 1e-7 # to account for floating point imprecisions + thresholds = [(i + 1) * 1.0 / (num_thresholds - 1) + for i in range(num_thresholds-2)] + thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon] + + values, update_ops = _confusion_matrix_at_thresholds( + labels, predictions, thresholds, weights) + tp = values['tp'] + fn = values['fn'] + tn = values['tn'] + fp = values['fp'] + + def compute_sensitivity_at_specificity(name): + specificities = math_ops.div(tn, tn + fp + kepsilon) + tf_index = math_ops.argmin(math_ops.abs(specificities - specificity), 0) + tf_index = math_ops.cast(tf_index, dtypes.int32) + + # Now, we have the implicit threshold, so compute the sensitivity: + return math_ops.div(tp[tf_index], + tp[tf_index] + fn[tf_index] + kepsilon, + name) + + sensitivity = compute_sensitivity_at_specificity('value') + with ops.control_dependencies(update_ops.values()): + update_op = compute_sensitivity_at_specificity('update_op') + + if metrics_collections: + ops.add_to_collections(metrics_collections, sensitivity) + + if updates_collections: + ops.add_to_collections(updates_collections, update_op) + + return sensitivity, update_op + + +def _expand_and_tile(tensor, multiple, dim=0, name=None): + """Slice `tensor` shape in 2, then tile along the sliced dimension. + + A new dimension is inserted in shape of `tensor` before `dim`, then values are + tiled `multiple` times along the new dimension. + + Args: + tensor: Input `Tensor` or `SparseTensor`. + multiple: Integer, number of times to tile. + dim: Integer, dimension along which to tile. + name: Name of operation. + + Returns: + `Tensor` result of expanding and tiling `tensor`. + + Raises: + ValueError: if `multiple` is less than 1, or `dim` is not in + `[-rank(tensor), rank(tensor)]`. + """ + if multiple < 1: + raise ValueError('Invalid multiple %s, must be > 0.' % multiple) + with ops.name_scope( + name, 'expand_and_tile', (tensor, multiple, dim)) as scope: + # Sparse. + if isinstance(tensor, sparse_tensor.SparseTensorValue): + tensor = sparse_tensor.SparseTensor.from_value(tensor) + if isinstance(tensor, sparse_tensor.SparseTensor): + if dim < 0: + expand_dims = array_ops.reshape( + array_ops.size(tensor.shape) + dim, [1]) + else: + expand_dims = [dim] + expanded_shape = array_ops.concat( + 0, (array_ops.slice(tensor.shape, [0], expand_dims), [1], + array_ops.slice(tensor.shape, expand_dims, [-1])), + name='expanded_shape') + expanded = sparse_ops.sparse_reshape( + tensor, shape=expanded_shape, name='expand') + if multiple == 1: + return expanded + return sparse_ops.sparse_concat( + dim - 1 if dim < 0 else dim, [expanded] * multiple, name=scope) + + # Dense. + expanded = array_ops.expand_dims( + tensor, dim if (dim >= 0) else (dim - 1), name='expand') + if multiple == 1: + return expanded + ones = array_ops.ones_like(array_ops.shape(tensor)) + tile_multiples = array_ops.concat( + 0, (ones[:dim], (multiple,), ones[dim:]), name='multiples') + return array_ops.tile(expanded, tile_multiples, name=scope) + + +def _num_relevant(labels, k): + """Computes number of relevant values for each row in labels. + + For labels with shape [D1, ... DN, num_labels], this is the minimum of + `num_labels` and `k`. + + Args: + labels: `int64` `Tensor` or `SparseTensor` with shape + [D1, ... 
DN, num_labels], where N >= 1 and num_labels is the number of + target classes for the associated prediction. Commonly, N=1 and `labels` + has shape [batch_size, num_labels]. + k: Integer, k for @k metric. + + Returns: + Integer `Tensor` of shape [D1, ... DN], where each value is the number of + relevant values for that row. + + Raises: + ValueError: if inputs have invalid dtypes or values. + """ + if k < 1: + raise ValueError('Invalid k=%s.' % k) + with ops.name_scope(None, 'num_relevant', (labels,)) as scope: + # For SparseTensor, calculate separate count for each row. + if isinstance( + labels, (sparse_tensor.SparseTensor, sparse_tensor.SparseTensorValue)): + labels_sizes = sets.set_size(labels) + return math_ops.minimum(labels_sizes, k, name=scope) + + # For dense Tensor, calculate scalar count based on last dimension, and + # tile across labels shape. + labels_shape = array_ops.shape(labels) + labels_size = labels_shape[-1] + num_relevant_scalar = math_ops.minimum(labels_size, k) + return array_ops.fill(labels_shape[0:-1], num_relevant_scalar, name=scope) + + +def _sparse_average_precision_at_k(labels, predictions, k): + """Computes average precision@k of predictions with respect to sparse labels. + + From en.wikipedia.org/wiki/Information_retrieval#Average_precision, formula + for each row is: + + AveP = sum_{i=1...k} P_{i} * rel_{i} / num_relevant_items + + A "row" is the elements in dimension [D1, ... DN] of `predictions`, `labels`, + and the result `Tensors`. In the common case, this is [batch_size]. Each row + of the results contains the average precision for that row. + + Internally, a `top_k` operation computes a `Tensor` indicating the top `k` + `predictions`. Set operations applied to `top_k` and `labels` calculate the + true positives, which are used to calculate the precision ("P_{i}" term, + above). + + Args: + labels: `int64` `Tensor` or `SparseTensor` with shape + [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of + target classes for the associated prediction. Commonly, N=1 and `labels` + has shape [batch_size, num_labels]. [D1, ... DN] must match + `predictions`. Values should be in range [0, num_classes), where + num_classes is the last dimension of `predictions`. Values outside this + range are ignored. + predictions: Float `Tensor` with shape [D1, ... DN, num_classes] where + N >= 1. Commonly, N=1 and `predictions` has shape + [batch size, num_classes]. The final dimension contains the logit values + for each class. [D1, ... DN] must match `labels`. + k: Integer, k for @k metric. This will calculate an average precision for + range `[1,k]`, as documented above. + + Returns: + `float64` `Tensor` of shape [D1, ... DN], where each value is the average + precision for that row. + + Raises: + ValueError: if k is invalid. + """ + if k < 1: + raise ValueError('Invalid k=%s.' % k) + with ops.name_scope( + None, 'average_precision', (predictions, labels, k)) as scope: + # Calculate top k indices to produce [D1, ... DN, k] tensor. + _, predictions_idx = nn.top_k(predictions, k) + predictions_idx = math_ops.to_int64(predictions_idx, name='predictions_idx') + + # Expand dims to produce [D1, ... DN, k, 1] tensor. This gives us a separate + # prediction for each k, so we can calculate separate true positive values + # for each k. + predictions_idx_per_k = array_ops.expand_dims( + predictions_idx, -1, name='predictions_idx_per_k') + + # Replicate labels k times to produce [D1, ... DN, k, num_labels] tensor. 
+ labels_per_k = _expand_and_tile(
+ labels, multiple=k, dim=-1, name='labels_per_k')
+
+ # The following tensors are all of shape [D1, ... DN, k], containing values
+ # per row, per k value.
+ # `relevant_per_k` (int32) - Relevance indicator, 1 if the prediction at
+ # that k value is correct, 0 otherwise. This is the "rel_{i}" term from
+ # the formula above.
+ # `tp_per_k` (int32) - True positive counts.
+ # `retrieved_per_k` (int32) - Number of predicted values at each k. This is
+ # the precision denominator.
+ # `precision_per_k` (float64) - Precision at each k. This is the "P_{i}"
+ # term from the formula above.
+ # `relevant_precision_per_k` (float64) - Relevant precisions; i.e.,
+ # precisions at all k for which relevance indicator is true.
+ relevant_per_k = _sparse_true_positive_at_k(
+ predictions_idx_per_k, labels_per_k, name='relevant_per_k')
+ tp_per_k = math_ops.cumsum(relevant_per_k, axis=-1, name='tp_per_k')
+ retrieved_per_k = math_ops.cumsum(
+ array_ops.ones_like(relevant_per_k), axis=-1, name='retrieved_per_k')
+ precision_per_k = math_ops.div(
+ math_ops.to_double(tp_per_k), math_ops.to_double(retrieved_per_k),
+ name='precision_per_k')
+ relevant_precision_per_k = math_ops.mul(
+ precision_per_k, math_ops.to_double(relevant_per_k),
+ name='relevant_precision_per_k')
+
+ # Reduce along k dimension to get the sum, yielding a [D1, ... DN] tensor.
+ precision_sum = math_ops.reduce_sum(
+ relevant_precision_per_k, reduction_indices=(-1,), name='precision_sum')
+
+ # Divide by number of relevant items to get average precision. These are
+ # the "num_relevant_items" and "AveP" terms from the formula above.
+ num_relevant_items = math_ops.to_double(_num_relevant(labels, k))
+ return math_ops.div(precision_sum, num_relevant_items, name=scope)
+
+
+def sparse_average_precision_at_k(labels,
+ predictions,
+ k,
+ weights=None,
+ metrics_collections=None,
+ updates_collections=None,
+ name=None):
+ """Computes average precision@k of predictions with respect to sparse labels.
+
+ `sparse_average_precision_at_k` creates two local variables,
+ `average_precision_at_<k>/total` and `average_precision_at_<k>/max`, that
+ are used to compute the frequency. This frequency is ultimately returned as
+ `average_precision_at_<k>`: an idempotent operation that simply divides
+ `average_precision_at_<k>/total` by `average_precision_at_<k>/max`.
+
+ For estimation of the metric over a stream of data, the function creates an
+ `update_op` operation that updates these variables and returns the
+ `average_precision_at_<k>`. Internally, a `top_k` operation computes a
+ `Tensor` indicating the top `k` `predictions`. Set operations applied to
+ `top_k` and `labels` calculate the true positives, which are used to compute
+ the per-row average precisions weighted by `weights`. Then `update_op`
+ increments `average_precision_at_<k>/total` and
+ `average_precision_at_<k>/max` using these values.
+
+ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
+
+ Args:
+ labels: `int64` `Tensor` or `SparseTensor` with shape
+ [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of
+ target classes for the associated prediction. Commonly, N=1 and `labels`
+ has shape [batch_size, num_labels]. [D1, ... DN] must match
+ `predictions`. Values should be in range [0, num_classes), where
+ num_classes is the last dimension of `predictions`. Values outside this
+ range are ignored.
+ predictions: Float `Tensor` with shape [D1, ... DN, num_classes] where
+ N >= 1. Commonly, N=1 and `predictions` has shape
+ [batch size, num_classes]. The final dimension contains the logit values
+ for each class. [D1, ... DN] must match `labels`.
+ k: Integer, k for @k metric. This will calculate an average precision for
+ range `[1,k]`, as documented above.
+ weights: An optional `Tensor` whose shape is broadcastable to the first
+ [D1, ... DN] dimensions of `predictions` and `labels`.
+ metrics_collections: An optional list of collections that values should
+ be added to.
+ updates_collections: An optional list of collections that updates should
+ be added to.
+ name: Name of new update operation, and namespace for other dependent ops.
+
+ Returns:
+ mean_average_precision: Scalar `float64` `Tensor` with the mean average
+ precision values.
+ update: `Operation` that increments variables appropriately, and whose
+ value matches `mean_average_precision`.
+ """
+ default_name = _at_k_name('average_precision', k)
+ with ops.name_scope(name, default_name, (predictions, labels)) as scope:
+ # Calculate per-example average precision, and apply weights.
+ average_precision = _sparse_average_precision_at_k(
+ predictions=predictions, labels=labels, k=k)
+ if weights is not None:
+ weights = math_ops.to_double(weights)
+ average_precision = math_ops.mul(average_precision, weights)
+
+ # Create accumulation variables and update ops for max average precision and
+ # total average precision.
+ with ops.name_scope(None, 'max', (average_precision,)) as max_scope:
+ # `max` is the max possible precision. Since max for any row is 1.0:
+ # - For the unweighted case, this is just the number of rows.
+ # - For the weighted case, it's the sum of the weights broadcast across
+ # `average_precision` rows.
+ max_var = _local_variable(
+ array_ops.zeros([], dtype=dtypes.float64), name=max_scope)
+ if weights is None:
+ batch_max = math_ops.to_double(
+ array_ops.size(average_precision, name='batch_max'))
+ else:
+ # TODO(ptucker): More efficient way to broadcast?
+ broadcast_weights = math_ops.mul(
+ weights, array_ops.ones_like(average_precision),
+ name='broadcast_weights')
+ batch_max = math_ops.reduce_sum(broadcast_weights, name='batch_max')
+ max_update = state_ops.assign_add(max_var, batch_max, name='update')
+ with ops.name_scope(None, 'total', (average_precision,)) as total_scope:
+ total_var = _local_variable(
+ array_ops.zeros([], dtype=dtypes.float64), name=total_scope)
+ batch_total = math_ops.reduce_sum(average_precision, name='batch_total')
+ total_update = state_ops.assign_add(total_var, batch_total, name='update')
+
+ # Divide total by max to get mean, for both vars and the update ops.
+ mean_average_precision = _safe_scalar_div(total_var, max_var, name='mean')
+ update = _safe_scalar_div(total_update, max_update, name=scope)
+
+ if metrics_collections:
+ ops.add_to_collections(metrics_collections, mean_average_precision)
+ if updates_collections:
+ ops.add_to_collections(updates_collections, update)
+
+ return mean_average_precision, update
+
+
+def _sparse_false_positive_at_k(labels,
+ predictions_idx,
+ class_id=None,
+ weights=None):
+ """Calculates false positives for precision@k.
+
+ If `class_id` is specified, calculate binary false positives for `class_id`
+ only.
+ If `class_id` is not specified, calculate metrics for `k` predicted vs
+ `n` label classes, where `n` is the 2nd dimension of `labels`.
+
+ Args:
+ labels: `int64` `Tensor` or `SparseTensor` with shape
+ [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of
+ target classes for the associated prediction. Commonly, N=1 and `labels`
+ has shape [batch_size, num_labels]. [D1, ... DN] must match
+ `predictions_idx`.
+ predictions_idx: 1-D or higher `int64` `Tensor` with last dimension `k`,
+ top `k` predicted classes. For rank `n`, the first `n-1` dimensions must
+ match `labels`.
+ class_id: Class for which we want binary metrics.
+ weights: `Tensor` whose shape is broadcastable to the first [D1, ... DN]
+ dimensions of `predictions_idx` and `labels`.
+
+ Returns:
+ A [D1, ... DN] `Tensor` of false positive counts.
+ """
+ with ops.name_scope(None, 'false_positives', (predictions_idx, labels)):
+ labels, predictions_idx = _maybe_select_class_id(labels,
+ predictions_idx,
+ class_id)
+ fp = sets.set_size(sets.set_difference(
+ predictions_idx, labels, aminusb=True))
+ fp = math_ops.to_double(fp)
+ if weights is not None:
+ weights = math_ops.to_double(weights)
+ fp = math_ops.mul(fp, weights)
+ return fp
+
+
+def _streaming_sparse_false_positive_at_k(labels,
+ predictions_idx,
+ k=None,
+ class_id=None,
+ weights=None,
+ name=None):
+ """Calculates weighted per step false positives for precision@k.
+
+ If `class_id` is specified, calculate binary false positives for `class_id`
+ only.
+ If `class_id` is not specified, calculate metrics for `k` predicted vs
+ `n` label classes, where `n` is the 2nd dimension of `labels`.
+
+ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
+
+ Args:
+ labels: `int64` `Tensor` or `SparseTensor` with shape
+ [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of
+ target classes for the associated prediction. Commonly, N=1 and `labels`
+ has shape [batch_size, num_labels]. [D1, ... DN] must match
+ `predictions_idx`.
+ predictions_idx: 1-D or higher `int64` `Tensor` with last dimension `k`,
+ top `k` predicted classes. For rank `n`, the first `n-1` dimensions must
+ match `labels`.
+ k: Integer, k for @k metric. This is only used for default op name.
+ class_id: Class for which we want binary metrics.
+ weights: `Tensor` whose shape is broadcastable to the first [D1, ... DN]
+ dimensions of `predictions_idx` and `labels`.
+ name: Name of new variable, and namespace for other dependent ops.
+
+ Returns:
+ A tuple of `Variable` and update `Operation`.
+
+ Raises:
+ ValueError: If `weights` is not `None` and has an incompatible shape.
+ """
+ default_name = _at_k_name('false_positive', k, class_id=class_id)
+ with ops.name_scope(name, default_name, (predictions_idx, labels)) as scope:
+ fp = _sparse_false_positive_at_k(
+ predictions_idx=predictions_idx, labels=labels, class_id=class_id,
+ weights=weights)
+ batch_total_fp = math_ops.to_double(math_ops.reduce_sum(fp))
+
+ var = _local_variable(array_ops.zeros([], dtype=dtypes.float64), name=scope)
+ return var, state_ops.assign_add(var, batch_total_fp, name='update')
+
+
+def _sparse_precision_at_k(labels,
+ top_k_idx,
+ k=None,
+ class_id=None,
+ weights=None,
+ metrics_collections=None,
+ updates_collections=None,
+ name=None):
+ """Computes precision@k of the top-k indices with respect to sparse labels.
+
+ This method contains the code shared by streaming_sparse_precision_at_k and
+ streaming_sparse_precision_at_top_k. Refer to those methods for more details.
+
+ Args:
+ labels: `int64` `Tensor` or `SparseTensor` with shape
+ [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of
+ target classes for the associated prediction. Commonly, N=1 and `labels`
+ has shape [batch_size, num_labels]. [D1, ... DN] must match
+ `predictions_idx`. Values should be in range [0, num_classes), where
+ num_classes is the last dimension of `predictions`. Values outside this
+ range are ignored.
+ top_k_idx: Integer `Tensor` with shape [D1, ... DN, k] where
+ N >= 1. Commonly, N=1 and top_k_idx has shape [batch size, k].
+ The final dimension contains the indices of top-k labels. [D1, ... DN]
+ must match `labels`.
+ k: Integer, k for @k metric or `None`. Only used for default op name.
+ class_id: Integer class ID for which we want binary metrics. This should be
+ in range [0, num_classes), where num_classes is the last dimension of
+ `predictions`. If `class_id` is outside this range, the method returns
+ NAN.
+ weights: An optional `Tensor` whose shape is broadcastable to the first
+ [D1, ... DN] dimensions of `predictions` and `labels`.
+ metrics_collections: An optional list of collections that values should
+ be added to.
+ updates_collections: An optional list of collections that updates should
+ be added to.
+ name: Name of the metric and of the enclosing scope.
+
+ Returns:
+ precision: Scalar `float64` `Tensor` with the value of `true_positives`
+ divided by the sum of `true_positives` and `false_positives`.
+ update_op: `Operation` that increments `true_positives` and
+ `false_positives` variables appropriately, and whose value matches
+ `precision`.
+
+ Raises:
+ ValueError: If `weights` is not `None` and its shape doesn't match
+ `predictions`, or if either `metrics_collections` or `updates_collections`
+ are not a list or tuple.
+ """
+ top_k_idx = math_ops.to_int64(top_k_idx)
+ tp, tp_update = _streaming_sparse_true_positive_at_k(
+ predictions_idx=top_k_idx, labels=labels, k=k, class_id=class_id,
+ weights=weights)
+ fp, fp_update = _streaming_sparse_false_positive_at_k(
+ predictions_idx=top_k_idx, labels=labels, k=k, class_id=class_id,
+ weights=weights)
+
+ metric = math_ops.div(tp, math_ops.add(tp, fp), name=name)
+ update = math_ops.div(
+ tp_update, math_ops.add(tp_update, fp_update), name='update')
+ if metrics_collections:
+ ops.add_to_collections(metrics_collections, metric)
+ if updates_collections:
+ ops.add_to_collections(updates_collections, update)
+ return metric, update
+
+
+def sparse_precision_at_k(labels,
+ predictions,
+ k,
+ class_id=None,
+ weights=None,
+ metrics_collections=None,
+ updates_collections=None,
+ name=None):
+ """Computes precision@k of the predictions with respect to sparse labels.
+
+ If `class_id` is specified, we calculate precision by considering only the
+ entries in the batch for which `class_id` is in the top-k highest
+ `predictions`, and computing the fraction of them for which `class_id` is
+ indeed a correct label.
+ If `class_id` is not specified, we'll calculate precision as how often on
+ average a class among the top-k classes with the highest predicted values
+ of a batch entry is correct and can be found in the label for that entry.
+
+ `sparse_precision_at_k` creates two local variables,
+ `true_positive_at_<k>` and `false_positive_at_<k>`, that are used to compute
+ the precision@k frequency. This frequency is ultimately returned as
+ `precision_at_<k>`: an idempotent operation that simply divides
+ `true_positive_at_<k>` by total (`true_positive_at_<k>` +
+ `false_positive_at_<k>`).
+
+ For estimation of the metric over a stream of data, the function creates an
+ `update_op` operation that updates these variables and returns the
+ `precision_at_<k>`. Internally, a `top_k` operation computes a `Tensor`
+ indicating the top `k` `predictions`. Set operations applied to `top_k` and
+ `labels` calculate the true positives and false positives weighted by
+ `weights`. Then `update_op` increments `true_positive_at_<k>` and
+ `false_positive_at_<k>` using these values.
+
+ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
+
+ Args:
+ labels: `int64` `Tensor` or `SparseTensor` with shape
+ [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of
+ target classes for the associated prediction. Commonly, N=1 and `labels`
+ has shape [batch_size, num_labels]. [D1, ... DN] must match
+ `predictions`. Values should be in range [0, num_classes), where
+ num_classes is the last dimension of `predictions`. Values outside this
+ range are ignored.
+ predictions: Float `Tensor` with shape [D1, ... DN, num_classes] where
+ N >= 1. Commonly, N=1 and predictions has shape [batch size, num_classes].
+ The final dimension contains the logit values for each class. [D1, ... DN]
+ must match `labels`.
+ k: Integer, k for @k metric.
+ class_id: Integer class ID for which we want binary metrics. This should be
+ in range [0, num_classes), where num_classes is the last dimension of
+ `predictions`. If `class_id` is outside this range, the method returns
+ NAN.
+ weights: An optional `Tensor` whose shape is broadcastable to the first
+ [D1, ... DN] dimensions of `predictions` and `labels`.
+ metrics_collections: An optional list of collections that values should
+ be added to.
+ updates_collections: An optional list of collections that updates should
+ be added to.
+ name: Name of new update operation, and namespace for other dependent ops.
+
+ Returns:
+ precision: Scalar `float64` `Tensor` with the value of `true_positives`
+ divided by the sum of `true_positives` and `false_positives`.
+ update_op: `Operation` that increments `true_positives` and
+ `false_positives` variables appropriately, and whose value matches
+ `precision`.
+
+ Raises:
+ ValueError: If `weights` is not `None` and its shape doesn't match
+ `predictions`, or if either `metrics_collections` or `updates_collections`
+ are not a list or tuple.
+ """
+ default_name = _at_k_name('precision', k, class_id=class_id)
+ with ops.name_scope(name, default_name,
+ (predictions, labels, weights)) as scope:
+ _, top_k_idx = nn.top_k(predictions, k)
+ return _sparse_precision_at_k(
+ top_k_idx=top_k_idx,
+ labels=labels,
+ k=k,
+ class_id=class_id,
+ weights=weights,
+ metrics_collections=metrics_collections,
+ updates_collections=updates_collections,
+ name=scope)
+
+
+def specificity_at_sensitivity(
+ labels, predictions, sensitivity, weights=None, num_thresholds=200,
+ metrics_collections=None, updates_collections=None, name=None):
+ """Computes the specificity at a given sensitivity.
+
+ The `specificity_at_sensitivity` function creates four local
+ variables, `true_positives`, `true_negatives`, `false_positives` and
+ `false_negatives` that are used to compute the specificity at the given
+ sensitivity value. The threshold for the given sensitivity value is computed
+ and used to evaluate the corresponding specificity.
+
+ For estimation of the metric over a stream of data, the function creates an
+ `update_op` operation that updates these variables and returns the
+ `specificity`. `update_op` increments the `true_positives`, `true_negatives`,
+ `false_positives` and `false_negatives` counts with the weight of each case
+ found in the `predictions` and `labels`.
+
+ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
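+
+ For example, the following sketch (tensor values are illustrative only) drops
+ the third example from the statistics by giving it zero weight:
+
+ ```python
+ labels = tf.constant([True, False, True, False])
+ predictions = tf.constant([0.9, 0.6, 0.4, 0.2])
+ weights = tf.constant([1.0, 1.0, 0.0, 1.0])  # third example is masked out
+ specificity, update_op = specificity_at_sensitivity(
+     labels, predictions, sensitivity=0.9, weights=weights)
+ ```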
+
+ For additional information about specificity and sensitivity, see the
+ following: https://en.wikipedia.org/wiki/Sensitivity_and_specificity
+
+ Args:
+ labels: A `bool` `Tensor` whose shape matches `predictions`.
+ predictions: A floating point `Tensor` of arbitrary shape and whose values
+ are in the range `[0, 1]`.
+ sensitivity: A scalar value in range `[0, 1]`.
+ weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
+ num_thresholds: The number of thresholds to use for matching the given
+ sensitivity.
+ metrics_collections: An optional list of collections that `specificity`
+ should be added to.
+ updates_collections: An optional list of collections that `update_op` should
+ be added to.
+ name: An optional variable_scope name.
+
+ Returns:
+ specificity: A scalar `Tensor` representing the specificity at the given
+ `sensitivity` value.
+ update_op: An operation that increments the `true_positives`,
+ `true_negatives`, `false_positives` and `false_negatives` variables
+ appropriately and whose value matches `specificity`.
+
+ Raises:
+ ValueError: If `predictions` and `labels` have mismatched shapes, if
+ `weights` is not `None` and its shape doesn't match `predictions`, or if
+ `sensitivity` is not between 0 and 1, or if either `metrics_collections`
+ or `updates_collections` are not a list or tuple.
+ """
+ if sensitivity < 0 or sensitivity > 1:
+ raise ValueError('`sensitivity` must be in the range [0, 1].')
+
+ with variable_scope.variable_scope(name, 'specificity_at_sensitivity',
+ (predictions, labels, weights)):
+ kepsilon = 1e-7 # to account for floating point imprecisions
+ thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
+ for i in range(num_thresholds-2)]
+ thresholds = [0.0 - kepsilon] + thresholds + [1.0 - kepsilon]
+
+ values, update_ops = _confusion_matrix_at_thresholds(
+ labels, predictions, thresholds, weights)
+ tp = values['tp']
+ fn = values['fn']
+ tn = values['tn']
+ fp = values['fp']
+
+ def compute_specificity_at_sensitivity(name):
+ """Computes the specificity at the given sensitivity.
+
+ Args:
+ name: The name of the operation.
+
+ Returns:
+ The specificity using the aggregated values.
+ """
+ sensitivities = math_ops.div(tp, tp + fn + kepsilon)
+
+ # We'll need to use this trick until tf.argmax allows us to specify
+ # whether we should use the first or last index in case of ties.
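+ # The cumulative sum of the 0/1 tie indicators below is non-decreasing and
+ # first reaches its maximum at the last tied index, so argmax over the
+ # cumsum picks the highest threshold among the ties.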
+ min_val = math_ops.reduce_min(math_ops.abs(sensitivities - sensitivity)) + indices_at_minval = math_ops.equal( + math_ops.abs(sensitivities - sensitivity), min_val) + indices_at_minval = math_ops.to_int64(indices_at_minval) + indices_at_minval = math_ops.cumsum(indices_at_minval) + tf_index = math_ops.argmax(indices_at_minval, 0) + tf_index = math_ops.cast(tf_index, dtypes.int32) + + # Now, we have the implicit threshold, so compute the specificity: + return math_ops.div(tn[tf_index], + tn[tf_index] + fp[tf_index] + kepsilon, + name) + + specificity = compute_specificity_at_sensitivity('value') + with ops.control_dependencies(update_ops.values()): + update_op = compute_specificity_at_sensitivity('update_op') + + if metrics_collections: + ops.add_to_collections(metrics_collections, specificity) + + if updates_collections: + ops.add_to_collections(updates_collections, update_op) + + return specificity, update_op diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py index 73208a350b9..cc94cf1f38e 100644 --- a/tensorflow/python/ops/standard_ops.py +++ b/tensorflow/python/ops/standard_ops.py @@ -39,6 +39,7 @@ from tensorflow.python.ops.check_ops import * from tensorflow.python.ops.clip_ops import * from tensorflow.python.ops.special_math_ops import * # TODO(vrv): Switch to import * once we're okay with exposing the module. +from tensorflow.python.ops.confusion_matrix import confusion_matrix from tensorflow.python.ops.control_flow_ops import Assert from tensorflow.python.ops.control_flow_ops import group from tensorflow.python.ops.control_flow_ops import no_op @@ -91,6 +92,7 @@ from tensorflow.python.framework import constant_op as _constant_op from tensorflow.python.ops import array_ops as _array_ops from tensorflow.python.ops import check_ops as _check_ops from tensorflow.python.ops import clip_ops as _clip_ops +from tensorflow.python.ops import confusion_matrix as _confusion_matrix from tensorflow.python.ops import control_flow_ops as _control_flow_ops from tensorflow.python.ops import data_flow_ops as _data_flow_ops from tensorflow.python.ops import functional_ops as _functional_ops @@ -244,6 +246,7 @@ _allowed_symbols_misc = [ "parse_single_sequence_example", "serialize_many_sparse", "serialize_sparse", + "confusion_matrix", ] _allowed_symbols = (_allowed_symbols_array_ops + @@ -262,6 +265,7 @@ remove_undocumented(__name__, _allowed_symbols, _array_ops, _check_ops, _clip_ops, + _confusion_matrix, _control_flow_ops, _constant_op, _data_flow_ops,
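With `confusion_matrix` exported from `standard_ops` above, the op can be reached directly from the top-level `tf` namespace. A quick sanity-check sketch (values are illustrative; assumes a TensorFlow build that includes this change):

```python
import tensorflow as tf

labels = [0, 1, 1, 2]
predictions = [0, 1, 2, 2]
with tf.Session() as sess:
  # Rows index `labels`, columns index `predictions`; the result is 3x3 here
  # since the largest class id seen is 2.
  print(sess.run(tf.confusion_matrix(labels, predictions)))
```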