Merge commit for internal changes

This commit is contained in:
Vijay Vasudevan 2016-03-18 22:10:29 -07:00
commit bf589e3da5
139 changed files with 6589 additions and 2541 deletions

View File

@@ -1,6 +1,6 @@
package(default_visibility = ["//visibility:public"])
archive_dir = "eigen-eigen-db7b61411772"
archive_dir = "eigen-eigen-0a13bf3e579d"
cc_library(
name = "eigen",

View File

@@ -24,6 +24,14 @@ py_library(
],
)
cc_library(
name = "contrib_kernels",
visibility = ["//visibility:public"],
deps = [
"//tensorflow/contrib/linear_optimizer/kernels:sdca_ops",
],
)
filegroup(
name = "all_files",
srcs = glob(

View File

@@ -211,6 +211,18 @@ class FullyConnectedTest(tf.test.TestCase):
tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
self.assertEqual(1, cnt[0])
def test_empty_x_results_in_empty_output(self):
# Empty x is common if someone masks their input with tf.boolean_mask in
# order to drop missing entries, and in a particular batch all entries are
# missing.
with self.test_session():
x = tf.constant([[]], shape=[0, 3])
self.assertEqual(0, tf.size(x).eval())
y = tf.contrib.layers.fully_connected(x, 2, activation_fn=tf.nn.softmax)
tf.initialize_all_variables().run()
expected_y = np.array([]).reshape(0, 2)
np.testing.assert_array_equal(expected_y, y.eval())
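The comment above describes masking away every row of a batch; as a rough sketch (using the TF 0.x-era session API seen in this file and a hypothetical 2x3 input), an all-False tf.boolean_mask produces exactly that [0, 3] tensor:

import tensorflow as tf

x = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
mask = tf.constant([False, False])      # every entry in this batch is "missing"
empty_x = tf.boolean_mask(x, mask)      # static shape [0, 3]
with tf.Session() as sess:
    print(sess.run(tf.shape(empty_x)))  # [0 3]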
class Convolution2dTest(tf.test.TestCase):

View File

@@ -22,16 +22,17 @@ These loss ops are, by design, minimal, enabling flexibility in how
their output can be used.
@@reduce_batch_sum
@@reduce_batch_mean
@@absolute_loss
@@squared_loss
@@logistic_loss
@@sum_absolute_loss
@@sum_squared_loss
@@mean_absolute_loss
@@mean_squared_loss
@@root_mean_squared_loss
@@sum_logistic_loss
@@scalar_absolute_loss
@@scalar_squared_loss
@@scalar_logistic_loss
"""
@@ -39,14 +40,15 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.contrib.layers.python.framework import tensor_util
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn
__all__ = ["reduce_batch_sum", "reduce_batch_mean", "absolute_loss",
"squared_loss", "sum_squared_loss", "mean_absolute_loss",
"mean_squared_loss", "root_mean_squared_loss",
__all__ = ["reduce_batch_sum", "absolute_loss", "squared_loss", "logistic_loss",
"sum_absolute_loss", "sum_squared_loss", "sum_logistic_loss",
"scalar_absolute_loss", "scalar_squared_loss",
"scalar_logistic_loss"]
@@ -120,31 +122,11 @@ def reduce_batch_sum(x, name=None):
return _reduce_batch(x, math_ops.reduce_sum, name)
def reduce_batch_mean(x, name=None):
"""Given a tensor `x`, returns the mean across all dimensions except dim 0.
Given a tensor with the number of dimensions > 1, reduce_batch_mean
will calculate the mean across all dimensions except for dimension
0. This function is useful for calculating the mean loss (error)
across all examples in a batch when training. As an example, given a
tensor of shape [batch_size, d1, d2], this function will calculate
the mean across dimensions d1 and d2, returning a tensor of shape
[batch_size].
Tensors of dimension 1 are returned as-is.
Args:
x: A `Tensor` with dimension > 0.
name: A name for the operation (optional).
Returns:
A `Tensor` with values averaged across all dimensions > 0.
Raises:
ValueError: If `x` has dimension 0.
"""
return _reduce_batch(x, math_ops.reduce_mean, name)
def _validate_predicted_and_target(predicted, target):
# TODO(ptucker): Optionally add assert op for shape check, for cases when
# shape is not fully defined at graph construction time?
predicted.get_shape().assert_is_compatible_with(target.get_shape())
tensor_util.assert_same_float_dtype([predicted, target])
def absolute_loss(predicted, target, name=None):
@@ -172,12 +154,12 @@ def absolute_loss(predicted, target, name=None):
with ops.op_scope([predicted, target], name, "absolute_loss") as scope:
predicted = ops.convert_to_tensor(predicted, name="predicted")
target = ops.convert_to_tensor(target, name="target")
predicted.get_shape().assert_is_compatible_with(target.get_shape())
_validate_predicted_and_target(predicted, target)
return math_ops.abs(target - predicted, name=scope)
def squared_loss(predicted, target, name=None):
"""Computes and returns the per-example squared loss.
"""Computes and returns the per-example squared loss, divided by 2.
Computes the per-example squared difference between the target and
predicted tensors. The tensors must have the same shape.
@@ -200,27 +182,33 @@ def squared_loss(predicted, target, name=None):
with ops.op_scope([predicted, target], name, "squared_loss") as scope:
predicted = ops.convert_to_tensor(predicted, name="predicted")
target = ops.convert_to_tensor(target, name="target")
predicted.get_shape().assert_is_compatible_with(target.get_shape())
return math_ops.square(target - predicted, name=scope)
_validate_predicted_and_target(predicted, target)
return math_ops.div(math_ops.square(target - predicted), 2.0, name=scope)
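A quick numpy check of the halved convention, using the same values as the updated SquaredLossTest later in this commit:

import numpy as np

predicted = np.array([1.1, -0.2, 3.3, 1.6])
target = np.array([1.0, 0.0, 3.0, 2.0])
print((target - predicted) ** 2)        # old behaviour: [0.01 0.04 0.09 0.16]
print((target - predicted) ** 2 / 2.0)  # new behaviour: [0.005 0.02 0.045 0.08]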
def sum_squared_loss(predicted, target, name=None):
"""Calculates 1/2 the sum of the squared loss across batches.
def logistic_loss(logit, target, name=None):
"""Calculates the logistic cross-entropy loss.
Computes the squared difference between the target and predicted
tensors, sums across all dimensions except dimension 0, and divides
by 2:
**WARNING:** `logit` must be unscaled, while the `target` should be a
normalized probability prediction. See
`tf.nn.sigmoid_cross_entropy_with_logits` for more details.
losses = reduce_batch_sum(squared_loss(predicted, target)) / 2.0
Args:
logit: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]`
of predicted logit values.
target: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]` of
target values. The shape of the target tensor should match the
`predicted` tensor.
name: A name for the operation (optional).
where `losses` is a tensor with dimensions [batch_size].
Returns:
A `Tensor` of the logistic cross-entropy loss.
"""
return nn.sigmoid_cross_entropy_with_logits(logit, target, name=name)
The tensors must have the same shape.
This function is equivalent to typical formulations of L2 loss, and
similar to TensorFlow's l2_loss function. It differs from the
l2_loss function by allowing the caller to specify both the
predicted and target tensors.
def _sum_loss(predicted, target, loss_fn, name="sum_loss"):
"""Apply loss function, then sum across all non-batch dimensions.
Args:
predicted: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]`
@@ -228,30 +216,23 @@ def sum_squared_loss(predicted, target, name=None):
target: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]` of
target values. The shape of the target tensor should match the
`predicted` tensor.
loss_fn: Loss to apply, takes 2 tensors as parameters and returns a tensor.
name: A name for the operation (optional).
Returns:
A `[batch_size]` tensor of squared losses summed across all dimensions
except dimension 0, divided by 2.
Raises:
ValueError: If `predicted` and `target` shapes do not match.
A `[batch_size]` tensor of losses summed across all dimensions except
dimension 0.
"""
with ops.op_scope([predicted, target], name, "sum_squared_loss") as scope:
return math_ops.div(
reduce_batch_sum(squared_loss(predicted, target)),
2.0,
name=scope)
return reduce_batch_sum(loss_fn(predicted, target), name=name)
def mean_absolute_loss(predicted, target, name=None):
"""Calculates the mean absolute loss across batches.
def sum_absolute_loss(predicted, target, name="sum_absolute_loss"):
"""Calculates the sum of absolute losses across batches.
Computes the absolute difference between the target and predicted
tensors, averaged across all dimensions except dimension 0:
losses = reduce_batch_mean(absolute_loss(predicted, target))
losses = reduce_batch_sum(absolute_loss(predicted, target))
where `losses` is a tensor with dimensions [batch_size].
@@ -275,22 +256,26 @@ def mean_absolute_loss(predicted, target, name=None):
ValueError: If `predicted` and `target` shapes do not match.
"""
with ops.op_scope([predicted, target], name, "mean_absolute_loss") as scope:
return reduce_batch_mean(absolute_loss(predicted, target), name=scope)
return _sum_loss(predicted, target, absolute_loss, name=name)
def mean_squared_loss(predicted, target, name=None):
"""Calculates the mean squared loss across batches.
def sum_squared_loss(predicted, target, name="sum_squared_loss"):
"""Calculates the sum of the squared loss across batches.
Computes the squared difference between the target and predicted
tensors, and averages across all dimensions except dimension 0:
tensors, sums across all dimensions except dimension 0.
losses = reduce_batch_mean(squared_loss(predicted, target))
losses = reduce_batch_sum(squared_loss(predicted, target))
where `losses` is a tensor with dimensions [batch_size].
The tensors must have the same shape.
This function is equivalent to typical formulations of L2 loss, and
similar to TensorFlow's l2_loss function. It differs from the
l2_loss function by allowing the caller to specify both the
predicted and target tensors.
Args:
predicted: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]`
of predicted values.
@@ -300,29 +285,63 @@ def mean_squared_loss(predicted, target, name=None):
name: A name for the operation (optional).
Returns:
A `[batch_size]` tensor of squared differences, averaged across
all dimensions except dimension 0.
A `[batch_size]` tensor of squared losses summed across all dimensions
except dimension 0.
Raises:
ValueError: If `predicted` and `target` shapes do not match.
"""
with ops.op_scope([predicted, target], name, "mean_squared_loss") as scope:
return reduce_batch_mean(squared_loss(predicted, target), name=scope)
return _sum_loss(predicted, target, squared_loss, name=name)
def root_mean_squared_loss(predicted, target, name=None):
"""Calculates the root mean squared loss across batches.
def sum_logistic_loss(logit, target, name="sum_logistic_loss"):
"""Calculates the sum of the logistic loss across batches.
Computes the root mean squared loss between the target and predicted
tensors, which is the square root of the mean squared differences
between the predicted and target tensors:
Computes the logistic loss between the logit and target tensors, summed across all
dimensions except dimension 0.
losses = sqrt(mean_squared_loss(predicted, target))
**WARNING:** `logit` must be unscaled, while the `target` should be a
normalized probability prediction. See
`tf.nn.sigmoid_cross_entropy_with_logits` for more details.
where `losses` is a tensor with dimensions [batch_size].
Args:
logit: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]`
of predicted logit values.
target: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]` of
target values. The shape of the target tensor should match the
`predicted` tensor.
name: A name for the operation (optional).
The tensors must have the same shape.
Returns:
A `[batch_size]` tensor of logistic losses summed across all dimensions
except dimension 0.
"""
return _sum_loss(logit, target, logistic_loss, name=name)
def _scalar_loss(predicted, target, loss_fn, name=None):
"""Reduces losses to a scalar.
Args:
predicted: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]`
of predicted values.
target: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]` of
target values. The shape of the target tensor should match the
`predicted` tensor.
loss_fn: Loss to apply, takes 2 tensors as parameters and returns a tensor.
name: A name for the operation (optional).
Returns:
The sum of losses per example, averaged across the batch.
"""
with ops.op_scope([predicted, target], name, "scalar_loss") as scope:
return math_ops.reduce_mean(
_sum_loss(predicted, target, loss_fn), name=scope)
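Put differently, the scalar variants compose two reductions: sum each example's losses over the non-batch dimensions, then average over the batch. A rough numpy reference of that composition:

import numpy as np

def scalar_loss_reference(elementwise_losses):
    # Sum each example's losses over all non-batch dimensions, then
    # average the per-example sums across the batch (dimension 0).
    flat = elementwise_losses.reshape(elementwise_losses.shape[0], -1)
    return flat.sum(axis=1).mean()

print(scalar_loss_reference(np.array([[0.005, 0.02], [0.045, 0.08]])))  # ~0.075, the mean of [0.025, 0.125]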
def scalar_absolute_loss(predicted, target, name="scalar_absolute_loss"):
"""Reduces absolute losses to a scalar.
Args:
predicted: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]`
@@ -333,20 +352,29 @@ def root_mean_squared_loss(predicted, target, name=None):
name: A name for the operation (optional).
Returns:
A `[batch_size]` tensor of the root mean squared differences.
Raises:
ValueError: If `predicted` and `target` shapes do not match.
The sum of absolute losses per example, averaged across the batch.
"""
with ops.op_scope([predicted, target],
name,
"root_mean_squared_loss") as scope:
return math_ops.sqrt(mean_squared_loss(predicted, target),
name=scope)
return _scalar_loss(predicted, target, loss_fn=absolute_loss, name=name)
def scalar_logistic_loss(logit, target, name=None):
def scalar_squared_loss(predicted, target, name="scalar_squared_loss"):
"""Reduces squared losses to a scalar.
Args:
predicted: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]`
of predicted values.
target: A `Tensor` of shape `[batch_size, dim_1, ..., dim_n]` of
target values. The shape of the target tensor should match the
`predicted` tensor.
name: A name for the operation (optional).
Returns:
The sum of squared losses per example, averaged across the batch.
"""
return _scalar_loss(predicted, target, loss_fn=squared_loss, name=name)
def scalar_logistic_loss(logit, target, name="scalar_logistic_loss"):
"""Calculates the logistic cross-entropy loss, averaged across batches.
**WARNING:** `logit` must be unscaled, while the `target` should be a
@@ -368,8 +396,5 @@ def scalar_logistic_loss(logit, target, name=None):
Raises:
ValueError: If `logit` and `target` shapes do not match.
"""
with ops.op_scope([logit, target], name,
"scalar_logistic_loss") as scope:
batch_loss = reduce_batch_sum(nn.sigmoid_cross_entropy_with_logits(logit,
target))
return math_ops.reduce_mean(batch_loss, [0], name=scope)
return _scalar_loss(logit, target, loss_fn=logistic_loss, name=name)
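Taken together, a hedged usage sketch of the reworked module: `squared_loss` is elementwise (now halved), the `sum_*` variants return one value per example, and the `scalar_*` variants reduce to a single scalar. Exposure of these names under `tf.contrib.layers` is assumed from the tests later in this commit.

import tensorflow as tf

predicted = tf.constant([[1.1, -0.2], [3.3, 1.6]])
target = tf.constant([[1.0, 0.0], [3.0, 2.0]])
elementwise = tf.contrib.layers.squared_loss(predicted, target)      # shape [2, 2]
per_example = tf.contrib.layers.sum_squared_loss(predicted, target)  # shape [2]
scalar = tf.contrib.layers.scalar_squared_loss(predicted, target)    # scalar
with tf.Session() as sess:
    print(sess.run([elementwise, per_example, scalar]))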

View File

@@ -21,6 +21,10 @@ from __future__ import print_function
import numpy as np
import tensorflow as tf
from tensorflow.contrib.layers.python.framework import tensor_util
pi = 3.14
indiana_pi = 3.2 # https://en.wikipedia.org/wiki/Indiana_Pi_Bill
class ReduceBatchSumTest(tf.test.TestCase):
@@ -89,72 +93,6 @@ class ReduceBatchSumTest(tf.test.TestCase):
self.assertAllClose(expected_result, actual_result.eval())
class ReduceBatchMeanTest(tf.test.TestCase):
def testDimensionNone(self):
with self.test_session():
input_array = np.array([
[1.0, 2.0],
[-1.0, -2.0]
], dtype=np.float32)
placeholder_vec = tf.placeholder(tf.float32, name="placeholder_vec")
expected_result = np.array([1.5, -1.5])
actual_result = tf.contrib.layers.reduce_batch_mean(placeholder_vec)
self.assertEqual(actual_result.get_shape().as_list(), [None])
self.assertAllClose(expected_result, actual_result.eval(feed_dict={
placeholder_vec: input_array
}))
def testDimension0(self):
with self.test_session():
input_vec = tf.constant(2.0)
with self.assertRaises(ValueError):
tf.contrib.layers.reduce_batch_mean(input_vec)
def testDimension1(self):
with self.test_session():
input_vec = tf.constant([1.0, 2.0])
expected_result = np.array([1.0, 2.0])
actual_result = tf.contrib.layers.reduce_batch_mean(input_vec)
self.assertAllClose(expected_result, actual_result.eval())
def testDimension2(self):
with self.test_session():
input_vec = tf.constant([
[1.0, 2.0],
[-1.0, -2.0]
])
expected_result = np.array([1.5, -1.5])
actual_result = tf.contrib.layers.reduce_batch_mean(input_vec)
self.assertAllClose(expected_result, actual_result.eval())
def testReturnShape(self):
with self.test_session():
input_vec = tf.constant([
[1.0, 2.0],
[-1.0, -2.0]
])
expected_result = np.array([3.0, -3.0])
actual_result = tf.contrib.layers.reduce_batch_mean(input_vec)
self.assertShapeEqual(expected_result, actual_result)
def testDimensionN(self):
with self.test_session():
input_vec = tf.constant([
[
[1.0, 2.0],
[3.0, 4.0]
],
[
[5.0, 6.0],
[7.0, 8.0]
]
])
expected_result = np.array([2.5, 6.5])
actual_result = tf.contrib.layers.reduce_batch_mean(input_vec)
self.assertAllClose(expected_result, actual_result.eval())
class AbsoluteLossTest(tf.test.TestCase):
def _getTestVectors(self):
@@ -191,7 +129,7 @@ class SquaredLossTest(tf.test.TestCase):
target = tf.constant([1.0, 0.0, 3.0, 2.0], shape=[2, 2], name="target")
predicted = tf.constant([1.1, -0.2, 3.3, 1.6], shape=[2, 2],
name="predicted")
expected_loss = np.array([0.01, 0.04, 0.09, 0.16]).reshape(2, 2)
expected_loss = np.array([0.005, 0.02, 0.045, 0.08]).reshape(2, 2)
return target, predicted, expected_loss
def testSquaredLoss(self):
@@ -250,114 +188,108 @@ class SumSquaredLossTest(tf.test.TestCase):
tf.contrib.layers.sum_squared_loss(incompatible_shape, target)
class MeanAbsoluteLossTest(tf.test.TestCase):
class ScalarAbsoluteLossTest(tf.test.TestCase):
def _getTestVectors(self):
target = tf.constant([[0.0, 1.0, 2.0],
[3.0, 2.0, 4.0]],
shape=[2, 3],
name="target")
predicted = tf.constant([[3.0, -3.0, 0.0],
[1.0, 2.0, 0.0]],
shape=[2, 3],
name="predicted")
expected_loss = np.array([3.0, 2.0])
return target, predicted, expected_loss
def testMeanAbsoluteLoss(self):
def testScalarAbsoluteLoss(self):
with self.test_session():
target, predicted, expected_loss = self._getTestVectors()
result = tf.contrib.layers.mean_absolute_loss(predicted, target)
self.assertAllClose(expected_loss, result.eval())
actual = tf.constant([pi], name="pi")
actual_placeholder = tf.placeholder(tf.float32)
label = tf.constant([indiana_pi], name="lbl")
label_placeholder = tf.placeholder(tf.float32, name="lbl_ph")
expected_loss = abs(indiana_pi - pi)
def testMeanAbsoluteLossReturnShape(self):
# Both shapes are set.
both_shapes_loss = tf.contrib.layers.scalar_absolute_loss(actual, label)
tf.initialize_all_variables().run()
np.testing.assert_almost_equal(
both_shapes_loss.eval(), expected_loss, decimal=6)
# No shape for 'actual' - check that the loss layer can be created.
no_actual_shape_loss = tf.contrib.layers.scalar_absolute_loss(
actual_placeholder, label)
tf.initialize_all_variables().run()
np.testing.assert_almost_equal(
no_actual_shape_loss.eval({actual_placeholder: [pi]}),
expected_loss, decimal=6)
# No shape for 'label' - check that the loss layer can be created.
no_label_shape_loss = tf.contrib.layers.scalar_absolute_loss(
actual, label_placeholder)
tf.initialize_all_variables().run()
np.testing.assert_almost_equal(
no_label_shape_loss.eval({label_placeholder: [indiana_pi]}),
expected_loss, decimal=6)
# No shapes.
no_shape_loss = tf.contrib.layers.scalar_absolute_loss(
actual_placeholder, label_placeholder)
tf.initialize_all_variables().run()
np.testing.assert_almost_equal(
no_shape_loss.eval({label_placeholder: [indiana_pi],
actual_placeholder: [pi]}),
expected_loss, decimal=6)
# Evaluate the previous one again, but this time with different
# (matching) shapes. This should still work.
np.testing.assert_almost_equal(
no_shape_loss.eval({label_placeholder: [indiana_pi, indiana_pi],
actual_placeholder: [pi, pi]}),
expected_loss, decimal=6)
class ScalarSquaredLossTest(tf.test.TestCase):
def testScalarSquaredLoss(self):
with self.test_session():
target, predicted, expected_loss = self._getTestVectors()
result = tf.contrib.layers.mean_absolute_loss(predicted, target)
self.assertShapeEqual(expected_loss, result)
actual = tf.constant([pi], name="pi")
actual_placeholder = tf.placeholder(tf.float32)
label = tf.constant([indiana_pi], name="lbl")
label_placeholder = tf.placeholder(tf.float32, name="lbl_ph")
expected_loss = (indiana_pi - pi) * (indiana_pi - pi) / 2
def testInvalidShapesValueError(self):
with self.test_session():
target = tf.constant([1.0, 0.0, 3.0, 2.0], shape=[2, 2], name="target")
incompatible_shape = tf.constant([0.0, 1.1], shape=[2],
name="incompatible_shape")
with self.assertRaises(ValueError):
tf.contrib.layers.mean_absolute_loss(incompatible_shape, target)
# Both shapes are set.
both_shapes_loss = tf.contrib.layers.scalar_squared_loss(actual, label)
tf.initialize_all_variables().run()
np.testing.assert_almost_equal(
both_shapes_loss.eval(), expected_loss, decimal=6)
# No shape for 'actual' - check that the loss layer can be created.
no_actual_shape_loss = tf.contrib.layers.scalar_squared_loss(
actual_placeholder, label)
tf.initialize_all_variables().run()
np.testing.assert_almost_equal(
no_actual_shape_loss.eval({actual_placeholder: [pi]}),
expected_loss, decimal=6)
# No shape for 'label' - check that the loss layer can be created.
no_label_shape_loss = tf.contrib.layers.scalar_squared_loss(
actual, label_placeholder)
tf.initialize_all_variables().run()
np.testing.assert_almost_equal(
no_label_shape_loss.eval({label_placeholder: [indiana_pi]}),
expected_loss,
decimal=6)
# No shapes.
no_shape_loss = tf.contrib.layers.scalar_squared_loss(
actual_placeholder, label_placeholder)
tf.initialize_all_variables().run()
np.testing.assert_almost_equal(
no_shape_loss.eval({label_placeholder: [indiana_pi],
actual_placeholder: [pi]}),
expected_loss, decimal=6)
# Evaluate the previous one again, but this time with different
# (matching) shapes. This should still work.
np.testing.assert_almost_equal(
no_shape_loss.eval({label_placeholder: [indiana_pi, indiana_pi],
actual_placeholder: [pi, pi]}),
expected_loss, decimal=6)
class MeanSquaredLossTest(tf.test.TestCase):
class ScalarLogisticLossTest(tf.test.TestCase):
def _getTestVectors(self):
target = tf.constant([[0.0, 1.0, 2.0],
[3.0, 2.0, 4.0]],
shape=[2, 3],
name="target")
predicted = tf.constant([[3.0, -3.0, 0.0],
[1.0, 2.0, 0.0]],
shape=[2, 3],
name="predicted")
expected_loss = np.array([9.666667, 6.666667])
return target, predicted, expected_loss
def testMeanSquaredLoss(self):
with self.test_session():
target, predicted, expected_loss = self._getTestVectors()
result = tf.contrib.layers.mean_squared_loss(predicted, target)
self.assertAllClose(expected_loss, result.eval())
def testMeanSquaredLossReturnShape(self):
with self.test_session():
target, predicted, expected_loss = self._getTestVectors()
result = tf.contrib.layers.mean_squared_loss(predicted, target)
self.assertShapeEqual(expected_loss, result)
def testInvalidShapesValueError(self):
with self.test_session():
target = tf.constant([1.0, 0.0, 3.0, 2.0], shape=[2, 2], name="target")
incompatible_shape = tf.constant([0.0, 1.1], shape=[2],
name="incompatible_shape")
with self.assertRaises(ValueError):
tf.contrib.layers.mean_squared_loss(incompatible_shape, target)
class RootMeanSquaredLossTest(tf.test.TestCase):
def _getTestVectors(self):
target = tf.constant([[0.0, 1.0, 2.0],
[3.0, 2.0, 4.0]],
shape=[2, 3],
name="target")
predicted = tf.constant([[3.0, -3.0, 0.0],
[1.0, 2.0, 0.0]],
shape=[2, 3],
name="predicted")
expected_loss = np.array([3.109126, 2.5819889])
return target, predicted, expected_loss
def testRootMeanSquaredLoss(self):
with self.test_session():
target, predicted, expected_loss = self._getTestVectors()
result = tf.contrib.layers.root_mean_squared_loss(predicted, target)
self.assertAllClose(expected_loss, result.eval())
def testRootMeanSquaredLossReturnShape(self):
with self.test_session():
target, predicted, expected_loss = self._getTestVectors()
result = tf.contrib.layers.root_mean_squared_loss(predicted, target)
self.assertShapeEqual(expected_loss, result)
def testInvalidShapesValueError(self):
with self.test_session():
target = tf.constant([1.0, 0.0, 3.0, 2.0], shape=[2, 2], name="target")
incompatible_shape = tf.constant([0.0, 1.1], shape=[2],
name="incompatible_shape")
with self.assertRaises(ValueError):
tf.contrib.layers.root_mean_squared_loss(incompatible_shape, target)
class MeanScalarLogisticLossTest(tf.test.TestCase):
def _get_mean_sigmoid_logistic_loss(self, logit, target):
def _expected_loss(self, logit, target):
sigmoid = 1.0 / (1.0 + np.exp(-logit))
logistic_loss = (target * -np.log(sigmoid)) - (
(1.0 - target) * np.log(1.0 - sigmoid))
@@ -365,14 +297,13 @@ class MeanScalarLogisticLossTest(tf.test.TestCase):
return np.sum(batch_losses) / len(batch_losses)
def test_mean__scalar_logistic_loss(self):
def test_scalar_logistic_loss(self):
logit = np.array([[9.45, -42], [4.2, 1], [-0.6, 20]])
target = np.array([[0.8, 0.9], [0.45, 0.99999], [0.1, 0.0006]])
expected_loss = self._get_mean_sigmoid_logistic_loss(logit, target)
with self.test_session():
result = tf.contrib.layers.scalar_logistic_loss(
tf.constant(logit), tf.constant(target))
self.assertAllClose(expected_loss, result.eval())
self.assertAllClose(self._expected_loss(logit, target), result.eval())
if __name__ == "__main__":

View File

@@ -36,6 +36,7 @@ py_test(
name = "sdca_ops_test",
srcs = ["python/kernel_tests/sdca_ops_test.py"],
srcs_version = "PY2AND3",
tags = ["noasan"], # doesn't pass ASAN for some reason
deps = [
":sdca_ops_py",
"//tensorflow:tensorflow_py",

View File

@@ -112,12 +112,13 @@ def make_dense_variable_dict(num_dense_features, num_examples):
def get_binary_predictions_for_logistic(predictions, cutoff=0.5):
return tf.cast(
tf.greater_equal(predictions, tf.ones_like(predictions) * cutoff),
tf.float32)
dtype=tf.float32)
def get_binary_predictions_for_hinge(predictions):
all_ones = tf.ones_like(predictions)
return tf.add(tf.sign(predictions), all_ones) / 2
return tf.cast(
tf.greater_equal(predictions, tf.zeros_like(predictions)),
dtype=tf.float32)
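Both helpers are simple thresholding: a 0.5 cutoff for logistic predictions and 0 for hinge scores. A quick numpy sketch that mirrors the logic (without running the TF ops):

import numpy as np

predictions = np.array([0.2, 0.5, 0.8, -0.3])
print((predictions >= 0.5).astype(np.float32))  # logistic cutoff 0.5: [0. 1. 1. 0.]
print((predictions >= 0.0).astype(np.float32))  # hinge cutoff 0:     [1. 1. 1. 0.]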
# Setup the single container shared across all tests. This is testing proper

View File

@@ -28,9 +28,11 @@ from tensorflow.python.framework import ops
from tensorflow.python.framework.load_library import load_op_library
from tensorflow.python.framework.ops import convert_to_tensor
from tensorflow.python.framework.ops import name_scope
from tensorflow.python.framework.ops import op_scope
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.ops import variables as var_ops
from tensorflow.python.ops.nn import sigmoid_cross_entropy_with_logits
from tensorflow.python.platform import resource_loader
@@ -55,6 +57,7 @@ def _maybe_load_sdca_ops():
assert _sdca_ops, 'Could not load _sdca_ops.so'
# TODO(rohananil): add op_scope to appropriate methods.
class SdcaModel(object):
"""Stochastic dual coordinate ascent solver for linear models.
@@ -255,13 +258,20 @@ class SdcaModel(object):
predictions = math_ops.sigmoid(predictions)
return predictions
def minimize(self):
def minimize(self, global_step=None, name=None):
"""Add operations to train a linear model by minimizing the loss function.
Args:
global_step: Optional `Variable` to increment by one after the
variables have been updated.
name: Optional name for the returned operation.
Returns:
An Operation that updates the variables passed in the constructor.
"""
with name_scope('sdca/minimize'):
# Technically, the op depends on a lot more than the variables,
# but we'll keep the list short.
with op_scope([], name, 'sdca/minimize'):
sparse_features_indices = []
sparse_features_values = []
for sf in self._examples['sparse_features']:
@@ -301,7 +311,7 @@ class SdcaModel(object):
assign_ops.append(var.assign(slot_var))
assign_group = control_flow_ops.group(*assign_ops)
with ops.control_dependencies([assign_group]):
return _sdca_ops.sdca_shrink_l1(
shrink_l1 = _sdca_ops.sdca_shrink_l1(
self._convert_n_to_tensor(
self._variables['sparse_features_weights'],
as_ref=True),
@@ -310,6 +320,11 @@ class SdcaModel(object):
as_ref=True),
l1=self._options['symmetric_l1_regularization'],
l2=self._symmetric_l2_regularization())
if not global_step:
return shrink_l1
with ops.control_dependencies([shrink_l1]):
with ops.colocate_with(global_step):
return state_ops.assign_add(global_step, 1, name=name).op
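A minimal sketch of the global_step wiring that minimize() now performs: run the main update op, then increment the counter under a control dependency. The train_op below is a hypothetical stand-in for the SDCA update, not the real solver:

import tensorflow as tf

global_step = tf.Variable(0, trainable=False, name="global_step")
weights = tf.Variable(1.0)
train_op = weights.assign_sub(0.1)        # stand-in for the shrink_l1 op above
with tf.control_dependencies([train_op]):
    minimize_op = tf.assign_add(global_step, 1).op

with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    sess.run(minimize_op)
    print(sess.run(global_step))          # 1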
def approximate_duality_gap(self):
"""Add operations to compute the approximate duality gap.

View File

@@ -968,7 +968,6 @@ tf_cuda_library(
tf_cuda_library(
name = "gpu_runtime",
srcs = [
"common_runtime/gpu/gpu_allocator_retry.cc",
"common_runtime/gpu/gpu_bfc_allocator.cc",
"common_runtime/gpu/gpu_debug_allocator.cc",
"common_runtime/gpu/gpu_device.cc",
@@ -982,7 +981,6 @@ tf_cuda_library(
"common_runtime/gpu_device_context.h",
],
hdrs = [
"common_runtime/gpu/gpu_allocator_retry.h",
"common_runtime/gpu/gpu_bfc_allocator.h",
"common_runtime/gpu/gpu_debug_allocator.h",
"common_runtime/gpu/gpu_device.h",
@@ -991,7 +989,6 @@ tf_cuda_library(
"common_runtime/gpu/gpu_util.h",
"common_runtime/gpu/pool_allocator.h",
"common_runtime/gpu/process_state.h",
"common_runtime/gpu/visitable_allocator.h",
],
copts = tf_copts(),
linkstatic = 1,

View File

@@ -420,18 +420,26 @@ void TF_Run_Helper(TF_Session* s, const char* handle,
run_options->length)) {
status->status =
tensorflow::errors::InvalidArgument("Unparseable RunOptions proto");
return;
}
if (run_outputs != nullptr && run_outputs->data != nullptr) {
status->status = tensorflow::errors::InvalidArgument(
"Passing non-empty run_outputs is invalid.");
return;
}
RunOutputs run_outputs_proto;
RunOutputs run_outputs_proto;
result = s->session->Run(run_options_proto, inputs, output_tensor_names,
target_node_names, &outputs, &run_outputs_proto);
// Serialize back to upstream client, who now owns the new buffer
int proto_size = run_outputs_proto.ByteSize();
void* str_buf = reinterpret_cast<void*>(operator new(proto_size));
run_outputs_proto.SerializeToArray(str_buf, proto_size);
run_outputs->data = str_buf;
run_outputs->length = proto_size;
if (run_outputs != nullptr) {
int proto_size = run_outputs_proto.ByteSize();
void* str_buf = reinterpret_cast<void*>(operator new(proto_size));
run_outputs_proto.SerializeToArray(str_buf, proto_size);
run_outputs->data = str_buf;
run_outputs->length = proto_size;
}
}
} else {
// NOTE(zongheng): PRun does not support RunOptions yet.

View File

@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h"
#include "tensorflow/core/common_runtime/allocator_retry.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mutex.h"
@@ -21,9 +21,9 @@ limitations under the License.
namespace tensorflow {
GPUAllocatorRetry::GPUAllocatorRetry() : env_(Env::Default()) {}
AllocatorRetry::AllocatorRetry() : env_(Env::Default()) {}
void* GPUAllocatorRetry::AllocateRaw(
void* AllocatorRetry::AllocateRaw(
std::function<void*(size_t alignment, size_t num_bytes,
bool verbose_failure)>
alloc_func,

View File

@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ALLOCATOR_RETRY_H_
#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ALLOCATOR_RETRY_H_
#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_ALLOCATOR_RETRY_H_
#define TENSORFLOW_CORE_COMMON_RUNTIME_ALLOCATOR_RETRY_H_
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/mutex.h"
@@ -23,9 +23,9 @@ limitations under the License.
namespace tensorflow {
// A retrying wrapper for a memory allocator.
class GPUAllocatorRetry {
class AllocatorRetry {
public:
GPUAllocatorRetry();
AllocatorRetry();
// Call 'alloc_func' to obtain memory. On first call,
// 'verbose_failure' will be false. If return value is nullptr,
@@ -50,11 +50,11 @@ class GPUAllocatorRetry {
};
// Implementation details below
inline void GPUAllocatorRetry::NotifyDealloc() {
inline void AllocatorRetry::NotifyDealloc() {
mutex_lock l(mu_);
memory_returned_.notify_all();
}
} // namespace tensorflow
#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ALLOCATOR_RETRY_H_
#endif // TENSORFLOW_CORE_COMMON_RUNTIME_ALLOCATOR_RETRY_H_

View File

@@ -0,0 +1,702 @@
/* Copyright 2015 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/common_runtime/bfc_allocator.h"
#include "tensorflow/core/common_runtime/allocator_retry.h"
#include "tensorflow/core/lib/core/bits.h"
#include "tensorflow/core/lib/gtl/stl_util.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
BFCAllocator::BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
bool allow_growth, const string& name)
: suballocator_(sub_allocator),
name_(name),
free_chunks_list_(kInvalidChunkHandle),
next_allocation_id_(1) {
if (allow_growth) {
// 1MiB smallest initial allocation, unless total memory available
// is less.
curr_region_allocation_bytes_ =
RoundedBytes(std::min(total_memory, size_t{1048576}));
} else {
curr_region_allocation_bytes_ = RoundedBytes(total_memory);
}
// Allocate the requested amount of memory.
memory_limit_ = total_memory;
stats_.bytes_limit = static_cast<int64>(total_memory);
// Create a bunch of bins of various good sizes.
// We create bins to fit all possible ranges that cover the
// memory_limit_ starting from allocations up to 256 bytes to
// allocations up to (and including) the memory limit.
for (BinNum b = 0; b < kNumBins; b++) {
size_t bin_size = BinNumToSize(b);
VLOG(1) << "Creating bin of max chunk size "
<< strings::HumanReadableNumBytes(bin_size);
new (BinFromIndex(b)) Bin(this, bin_size);
CHECK_EQ(BinForSize(bin_size), BinFromIndex(b));
CHECK_EQ(BinForSize(bin_size + 255), BinFromIndex(b));
CHECK_EQ(BinForSize(bin_size * 2 - 1), BinFromIndex(b));
if (b + 1 < kNumBins) {
CHECK_NE(BinForSize(bin_size * 2), BinFromIndex(b));
}
}
}
BFCAllocator::~BFCAllocator() {
// Return memory back.
VLOG(2) << "Number of regions allocated: "
<< region_manager_.regions().size();
for (const auto& region : region_manager_.regions()) {
suballocator_->Free(region.ptr(), region.memory_size());
}
for (BinNum b = 0; b < kNumBins; b++) {
BinFromIndex(b)->~Bin();
}
}
BFCAllocator::Chunk* BFCAllocator::ChunkFromHandle(ChunkHandle h) {
DCHECK_GE(h, 0);
DCHECK_LT(h, static_cast<int>(chunks_.size()));
return &(chunks_[h]);
}
bool BFCAllocator::Extend(size_t rounded_bytes) {
// Do we have enough space to handle the client's request?
// If not, fail immediately.
if (total_region_allocated_bytes_ + rounded_bytes > memory_limit_) {
return false;
}
// If curr_region_allocation_bytes_ is not enough to satisfy the
// allocation, keep multiplying by a power of two until that is
// sufficient.
bool increased_allocation = false;
while (rounded_bytes > curr_region_allocation_bytes_) {
curr_region_allocation_bytes_ *= 2;
increased_allocation = true;
}
// Try allocating.
size_t bytes = curr_region_allocation_bytes_;
void* mem_addr = suballocator_->Alloc(32, bytes);
if (mem_addr == nullptr && !started_backpedal_) {
// Only backpedal once.
started_backpedal_ = true;
static constexpr float kBackpedalFactor = 0.9;
// Try allocating less memory.
bytes = RoundedBytes(bytes * kBackpedalFactor);
while (mem_addr == nullptr && bytes > rounded_bytes) {
mem_addr = suballocator_->Alloc(32, bytes);
bytes = RoundedBytes(bytes * kBackpedalFactor);
}
}
if (mem_addr == nullptr) {
return false;
}
if (!increased_allocation) {
// Increase the region size of the next required allocation.
curr_region_allocation_bytes_ *= 2;
}
VLOG(1) << "Extending allocation by " << strings::HumanReadableNumBytes(bytes)
<< " bytes.";
total_region_allocated_bytes_ += bytes;
VLOG(1) << "Total allocated bytes: "
<< strings::HumanReadableNumBytes(total_region_allocated_bytes_);
VLOG(1) << "Allocated memory at " << mem_addr << " to "
<< static_cast<void*>(static_cast<char*>(mem_addr) + bytes);
region_manager_.AddAllocationRegion(mem_addr, bytes);
// Create one large chunk for the whole memory space that will
// be chunked later.
ChunkHandle h = AllocateChunk();
BFCAllocator::Chunk* c = ChunkFromHandle(h);
c->ptr = mem_addr;
c->size = bytes;
c->allocation_id = -1;
c->prev = kInvalidChunkHandle;
c->next = kInvalidChunkHandle;
region_manager_.set_handle(c->ptr, h);
// TODO(vrv): Try to merge this new region with an existing region,
// if the address space is contiguous, to avoid fragmentation
// across regions.
// Insert the chunk into the right bin.
InsertFreeChunkIntoBin(h);
// Invoke visitors on newly allocated region.
for (auto visitor : region_visitors_) {
visitor(mem_addr, bytes);
}
return true;
}
BFCAllocator::ChunkHandle BFCAllocator::AllocateChunk() {
if (free_chunks_list_ != kInvalidChunkHandle) {
ChunkHandle h = free_chunks_list_;
Chunk* c = ChunkFromHandle(h);
free_chunks_list_ = c->next;
return h;
} else {
ChunkHandle h = chunks_.size();
chunks_.resize(h + 1);
return h;
}
}
void BFCAllocator::DeallocateChunk(ChunkHandle h) {
Chunk* c = ChunkFromHandle(h);
c->next = free_chunks_list_;
free_chunks_list_ = h;
}
void* BFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes) {
// Fast path: Try once to allocate without getting the retry_helper_ involved
void* r = AllocateRawInternal(unused_alignment, num_bytes, false);
if (r != nullptr) {
return r;
} else {
static const int64 kMaxMillisToWait = 10000; // 10 seconds
return retry_helper_.AllocateRaw(
[this](size_t a, size_t nb, bool v) {
return AllocateRawInternal(a, nb, v);
},
kMaxMillisToWait, unused_alignment, num_bytes);
}
}
void* BFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes,
const AllocationAttributes& allocation_attr) {
if (allocation_attr.no_retry_on_failure) {
// Return immediately upon the first failure if this is for allocating an
// optional scratch space.
void* result = AllocateRawInternal(unused_alignment, num_bytes, false);
if (result == nullptr) {
// The counter incrementing is not thread-safe. But we don't really care.
// TODO(zhengxq): we should implement a LOG_FIRST_N and LOG_EVERY_N for
// more general usage.
static int log_counter = 0;
if (log_counter < 10) {
log_counter++;
LOG(WARNING)
<< "Ran out of memory trying to allocate "
<< strings::HumanReadableNumBytes(num_bytes)
<< ". The caller indicates that this is not a failure, but"
<< " may mean that there could be performance gains if more"
<< " memory is available.";
}
}
return result;
} else {
return AllocateRaw(unused_alignment, num_bytes);
}
}
// static
size_t BFCAllocator::RoundedBytes(size_t bytes) {
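// For illustration: with kMinAllocationSize == 256 (kMinAllocationBits == 8),
// this rounds 1 -> 256, 256 -> 256, and 257 -> 512.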
size_t rounded_bytes =
(kMinAllocationSize *
((bytes + kMinAllocationSize - 1) / kMinAllocationSize));
DCHECK_EQ(size_t{0}, rounded_bytes % kMinAllocationSize);
return rounded_bytes;
}
void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
size_t num_bytes,
bool dump_log_on_failure) {
if (num_bytes == 0) {
LOG(ERROR) << "tried to allocate 0 bytes";
return nullptr;
}
// First, always allocate memory of at least kMinAllocationSize
// bytes, and always allocate multiples of kMinAllocationSize bytes
// so all memory addresses are nicely byte aligned.
size_t rounded_bytes = RoundedBytes(num_bytes);
// The BFC allocator tries to find the best fit first.
BinNum bin_num = BinNumForSize(rounded_bytes);
mutex_lock l(lock_);
void* ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes);
if (ptr != nullptr) {
return ptr;
}
// Try to extend
if (Extend(rounded_bytes)) {
ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes);
if (ptr != nullptr) {
return ptr;
}
}
// We searched all bins for an existing free chunk to use and
// couldn't find one. This means we must have run out of memory.
// Dump the memory log for analysis.
if (dump_log_on_failure) {
DumpMemoryLog(rounded_bytes);
LOG(WARNING) << RenderOccupancy();
LOG(WARNING) << "Ran out of memory trying to allocate "
<< strings::HumanReadableNumBytes(num_bytes)
<< ". See logs for memory state.";
}
return nullptr;
}
void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
size_t num_bytes) {
// First identify the first bin that could satisfy rounded_bytes.
for (; bin_num < kNumBins; bin_num++) {
// Start searching from the first bin for the smallest chunk that fits
// rounded_bytes.
Bin* b = BinFromIndex(bin_num);
for (auto citer = b->free_chunks.begin(); citer != b->free_chunks.end();
++citer) {
const BFCAllocator::ChunkHandle h = (*citer);
BFCAllocator::Chunk* chunk = ChunkFromHandle(h);
DCHECK(!chunk->in_use());
if (chunk->size >= rounded_bytes) {
// We found an existing chunk that fits us that wasn't in use, so remove
// it from the free bin structure prior to using.
RemoveFreeChunkIterFromBin(&b->free_chunks, citer);
// If we can break the size of the chunk into two reasonably
// large pieces, do so.
//
// TODO(vrv): What should be the criteria when deciding when
// to split?
if (chunk->size >= rounded_bytes * 2) {
SplitChunk(h, rounded_bytes);
chunk = ChunkFromHandle(h); // Update chunk pointer in case it moved
}
// The requested size of the returned chunk is what the user
// has allocated.
chunk->requested_size = num_bytes;
// Assign a unique id and increment the id counter, marking the
// chunk as being in use.
chunk->allocation_id = next_allocation_id_++;
// Update stats.
++stats_.num_allocs;
stats_.bytes_in_use += chunk->size;
stats_.max_bytes_in_use =
std::max(stats_.max_bytes_in_use, stats_.bytes_in_use);
stats_.max_alloc_size =
std::max<std::size_t>(stats_.max_alloc_size, chunk->size);
VLOG(4) << "Returning: " << chunk->ptr;
if (VLOG_IS_ON(4)) {
LOG(INFO) << "A: " << RenderOccupancy();
}
return chunk->ptr;
}
}
}
return nullptr;
}
void BFCAllocator::SplitChunk(BFCAllocator::ChunkHandle h, size_t num_bytes) {
// Allocate the new chunk before we do any ChunkFromHandle
ChunkHandle h_new_chunk = AllocateChunk();
Chunk* c = ChunkFromHandle(h);
CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum));
// Create a new chunk starting num_bytes after c
BFCAllocator::Chunk* new_chunk = ChunkFromHandle(h_new_chunk);
new_chunk->ptr = static_cast<void*>(static_cast<char*>(c->ptr) + num_bytes);
region_manager_.set_handle(new_chunk->ptr, h_new_chunk);
// Set the new sizes of the chunks.
new_chunk->size = c->size - num_bytes;
c->size = num_bytes;
// The new chunk is not in use.
new_chunk->allocation_id = -1;
// Maintain the pointers.
// c <-> c_neighbor becomes
// c <-> new_chunk <-> c_neighbor
BFCAllocator::ChunkHandle h_neighbor = c->next;
new_chunk->prev = h;
new_chunk->next = h_neighbor;
c->next = h_new_chunk;
if (h_neighbor != kInvalidChunkHandle) {
Chunk* c_neighbor = ChunkFromHandle(h_neighbor);
c_neighbor->prev = h_new_chunk;
}
// Add the newly free chunk to the free bin.
InsertFreeChunkIntoBin(h_new_chunk);
}
void BFCAllocator::DeallocateRaw(void* ptr) {
DeallocateRawInternal(ptr);
retry_helper_.NotifyDealloc();
}
void BFCAllocator::DeallocateRawInternal(void* ptr) {
if (ptr == nullptr) {
LOG(ERROR) << "tried to deallocate nullptr";
return;
}
mutex_lock l(lock_);
// Find the chunk from the ptr.
BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
CHECK(h != kInvalidChunkHandle);
// Consider coalescing it.
FreeAndMaybeCoalesce(h);
if (VLOG_IS_ON(4)) {
LOG(INFO) << "F: " << RenderOccupancy();
}
}
// Merges h1 and h2 when Chunk(h1)->next is h2 and Chunk(h2)->prev is h1.
// We merge Chunk(h2) into Chunk(h1).
void BFCAllocator::Merge(BFCAllocator::ChunkHandle h1,
BFCAllocator::ChunkHandle h2) {
Chunk* c1 = ChunkFromHandle(h1);
Chunk* c2 = ChunkFromHandle(h2);
// We can only merge chunks that are not in use.
CHECK(!c1->in_use() && !c2->in_use());
// c1's prev doesn't change, still points to the same ptr, and is
// still not in use.
// Fix up neighbor pointers
//
// c1 <-> c2 <-> c3 should become
// c1 <-> c3
BFCAllocator::ChunkHandle h3 = c2->next;
c1->next = h3;
CHECK(c2->prev == h1);
if (h3 != kInvalidChunkHandle) {
BFCAllocator::Chunk* c3 = ChunkFromHandle(h3);
c3->prev = h1;
}
// Set the new size
c1->size += c2->size;
DeleteChunk(h2);
}
void BFCAllocator::DeleteChunk(ChunkHandle h) {
// Delete h and cleanup all state
Chunk* c = ChunkFromHandle(h);
// VLOG(4) << "Removing: " << c->ptr;
region_manager_.erase(c->ptr);
DeallocateChunk(h);
}
void BFCAllocator::InsertFreeChunkIntoBin(BFCAllocator::ChunkHandle h) {
Chunk* c = ChunkFromHandle(h);
CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum));
BinNum bin_num = BinNumForSize(c->size);
Bin* new_bin = BinFromIndex(bin_num);
c->bin_num = bin_num;
new_bin->free_chunks.insert(h);
}
void BFCAllocator::RemoveFreeChunkIterFromBin(
BFCAllocator::Bin::FreeChunkSet* free_chunks,
const BFCAllocator::Bin::FreeChunkSet::iterator& citer) {
ChunkHandle h = *citer;
Chunk* c = ChunkFromHandle(h);
CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum));
free_chunks->erase(citer);
c->bin_num = kInvalidBinNum;
}
void BFCAllocator::RemoveFreeChunkFromBin(BFCAllocator::ChunkHandle h) {
Chunk* c = ChunkFromHandle(h);
CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum));
int count = BinFromIndex(c->bin_num)->free_chunks.erase(h);
CHECK(count > 0) << "Could not find chunk in bin";
c->bin_num = kInvalidBinNum;
}
void BFCAllocator::FreeAndMaybeCoalesce(BFCAllocator::ChunkHandle h) {
Chunk* c = ChunkFromHandle(h);
CHECK(c->in_use() && (c->bin_num == kInvalidBinNum));
// Mark the chunk as no longer in use
c->allocation_id = -1;
// Updates the stats.
stats_.bytes_in_use -= c->size;
// This chunk is no longer in-use, consider coalescing the chunk
// with adjacent chunks.
ChunkHandle chunk_to_reassign = h;
// If the next chunk is free, coalesce the two
if (c->next != kInvalidChunkHandle) {
Chunk* cnext = ChunkFromHandle(c->next);
if (!cnext->in_use()) {
// VLOG(8) << "Chunk at " << cnext->ptr << " merging with c " <<
// c->ptr;
chunk_to_reassign = h;
// Deletes c->next
RemoveFreeChunkFromBin(c->next);
Merge(h, ChunkFromHandle(h)->next);
}
}
// If the previous chunk is free, coalesce the two
c = ChunkFromHandle(h);
if (c->prev != kInvalidChunkHandle) {
Chunk* cprev = ChunkFromHandle(c->prev);
if (!cprev->in_use()) {
// VLOG(8) << "Chunk at " << c->ptr << " merging into c->prev "
// << cprev->ptr;
chunk_to_reassign = c->prev;
// Deletes c
RemoveFreeChunkFromBin(c->prev);
Merge(ChunkFromHandle(h)->prev, h);
c = ChunkFromHandle(h);
}
}
InsertFreeChunkIntoBin(chunk_to_reassign);
}
void BFCAllocator::AddAllocVisitor(Visitor visitor) {
VLOG(1) << "AddVisitor";
mutex_lock l(lock_);
region_visitors_.push_back(visitor);
for (const auto& region : region_manager_.regions()) {
visitor(region.ptr(), region.memory_size());
}
}
bool BFCAllocator::TracksAllocationSizes() { return true; }
size_t BFCAllocator::RequestedSize(void* ptr) {
mutex_lock l(lock_);
BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
CHECK(h != kInvalidChunkHandle)
<< "Asked for requested size of pointer we never allocated: " << ptr;
BFCAllocator::Chunk* c = ChunkFromHandle(h);
return c->requested_size;
}
size_t BFCAllocator::AllocatedSize(void* ptr) {
mutex_lock l(lock_);
BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
CHECK(h != kInvalidChunkHandle)
<< "Asked for allocated size of pointer we never allocated: " << ptr;
BFCAllocator::Chunk* c = ChunkFromHandle(h);
return c->size;
}
int64 BFCAllocator::AllocationId(void* ptr) {
mutex_lock l(lock_);
BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
CHECK(h != kInvalidChunkHandle)
<< "Asked for allocation id of pointer we never allocated: " << ptr;
BFCAllocator::Chunk* c = ChunkFromHandle(h);
return c->allocation_id;
}
namespace {
void RenderRegion(char* rendered, const size_t resolution,
const size_t total_render_size, const size_t offset,
const void* base_ptr, const void* ptr, const size_t size,
const char c) {
const char* base_ptr_c = static_cast<const char*>(base_ptr);
const char* ptr_c = static_cast<const char*>(ptr);
size_t start_location =
((ptr_c - base_ptr_c + offset) * resolution) / total_render_size;
CHECK_GE(start_location, 0);
CHECK_LT(start_location, resolution);
size_t end_location =
((ptr_c + size - 1 - base_ptr_c + offset) * resolution) /
total_render_size;
CHECK_GE(end_location, 0);
CHECK_LT(end_location, resolution);
for (size_t i = start_location; i <= end_location; ++i) {
rendered[i] = c;
}
}
} // namespace
string BFCAllocator::RenderOccupancy() {
// Make a buffer for the ASCII-art representation.
const size_t resolution = 100;
char rendered[resolution];
// Compute the total region size to render over
size_t total_region_size = 0;
for (const auto& region : region_manager_.regions()) {
total_region_size += region.memory_size();
}
// Start out with everything empty
RenderRegion(rendered, resolution, total_region_size, 0, nullptr, nullptr,
total_region_size, '_');
size_t region_offset = 0;
for (const auto& region : region_manager_.regions()) {
ChunkHandle h = region_manager_.get_handle(region.ptr());
// Then render each chunk left to right.
while (h != kInvalidChunkHandle) {
Chunk* c = ChunkFromHandle(h);
if (c->in_use()) {
// Render the wasted space
size_t wasted = c->size - c->requested_size;
if (wasted > 0) {
RenderRegion(rendered, resolution, total_region_size,
region_offset + c->requested_size, region.ptr(), c->ptr,
wasted, 'x');
}
// Then the occupied space
RenderRegion(rendered, resolution, total_region_size, region_offset,
region.ptr(), c->ptr, c->requested_size, '*');
}
h = c->next;
}
region_offset += region.memory_size();
}
return StringPiece(rendered, resolution).ToString();
}
void BFCAllocator::DumpMemoryLog(size_t num_bytes) {
// For each bin: tally up the total number of chunks and bytes.
// Note that bins hold only free chunks.
for (BinNum bin_num = 0; bin_num < kNumBins; bin_num++) {
Bin* b = BinFromIndex(bin_num);
size_t total_bytes_in_use = 0;
size_t total_bytes_in_bin = 0;
size_t total_requested_bytes_in_use = 0;
size_t total_requested_bytes_in_bin = 0;
size_t total_chunks_in_use = 0;
size_t total_chunks_in_bin = 0;
for (ChunkHandle h : b->free_chunks) {
Chunk* c = ChunkFromHandle(h);
total_bytes_in_bin += c->size;
total_requested_bytes_in_bin += c->requested_size;
++total_chunks_in_bin;
if (c->in_use()) {
total_bytes_in_use += c->size;
total_requested_bytes_in_use += c->requested_size;
++total_chunks_in_use;
}
}
LOG(INFO) << "Bin (" << b->bin_size
<< "): \tTotal Chunks: " << total_chunks_in_bin
<< ", Chunks in use: " << total_chunks_in_use << " "
<< strings::HumanReadableNumBytes(total_bytes_in_bin)
<< " allocated for chunks. "
<< strings::HumanReadableNumBytes(total_requested_bytes_in_bin)
<< " client-requested for chunks. "
<< strings::HumanReadableNumBytes(total_bytes_in_use)
<< " in use in bin. "
<< strings::HumanReadableNumBytes(total_requested_bytes_in_use)
<< " client-requested in use in bin.";
}
// Find the bin that we would have liked to allocate in, so we
// can get some further analysis about fragmentation.
Bin* b = BinForSize(num_bytes);
LOG(INFO) << "Bin for " << strings::HumanReadableNumBytes(num_bytes)
<< " was " << strings::HumanReadableNumBytes(b->bin_size)
<< ", Chunk State: ";
for (ChunkHandle h : b->free_chunks) {
Chunk* c = ChunkFromHandle(h);
LOG(INFO) << c->DebugString(this, true);
}
// Next show the chunks that are in use, and also summarize their
// number by size.
std::map<size_t, int> in_use_by_size;
for (const auto& region : region_manager_.regions()) {
ChunkHandle h = region_manager_.get_handle(region.ptr());
while (h != kInvalidChunkHandle) {
const Chunk* c = ChunkFromHandle(h);
if (c->in_use()) {
in_use_by_size[c->size]++;
LOG(INFO) << "Chunk at " << c->ptr << " of size " << c->size;
}
h = c->next;
}
h = region_manager_.get_handle(region.ptr());
while (h != kInvalidChunkHandle) {
const Chunk* c = ChunkFromHandle(h);
if (!c->in_use()) {
LOG(INFO) << "Free at " << c->ptr << " of size " << c->size;
}
h = c->next;
}
}
LOG(INFO) << " Summary of in-use Chunks by size: ";
size_t total_bytes = 0;
for (auto& it : in_use_by_size) {
LOG(INFO) << it.second << " Chunks of size " << it.first << " totalling "
<< strings::HumanReadableNumBytes(it.first * it.second);
total_bytes += (it.first * it.second);
}
LOG(INFO) << "Sum Total of in-use chunks: "
<< strings::HumanReadableNumBytes(total_bytes);
LOG(INFO) << "Stats: \n" << stats_.DebugString();
}
void BFCAllocator::GetStats(AllocatorStats* stats) {
mutex_lock l(lock_);
*stats = stats_;
}
} // namespace tensorflow

View File

@@ -0,0 +1,413 @@
/* Copyright 2015 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_COMMON_RUNTIME_BFC_ALLOCATOR_H_
#define TENSORFLOW_COMMON_RUNTIME_BFC_ALLOCATOR_H_
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "tensorflow/core/common_runtime/allocator_retry.h"
#include "tensorflow/core/common_runtime/visitable_allocator.h"
#include "tensorflow/core/lib/gtl/stl_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/protobuf/config.pb.h"
namespace tensorflow {
// A memory allocator that implements a 'best-fit with coalescing'
// algorithm. This is essentially a very simple version of Doug Lea's
// malloc (dlmalloc).
//
// The goal of this allocator is to support defragmentation via
// coalescing. One assumption we make is that the process using this
// allocator owns pretty much all of the memory, and that nearly
// all requests to allocate memory go through this interface.
class BFCAllocator : public VisitableAllocator {
public:
// Takes ownership of sub_allocator.
BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
bool allow_growth, const string& name);
~BFCAllocator() override;
string Name() override { return name_; }
void* AllocateRaw(size_t alignment, size_t num_bytes) override;
void* AllocateRaw(size_t alignment, size_t num_bytes,
const AllocationAttributes& allocation_attr) override;
void DeallocateRaw(void* ptr) override;
void AddAllocVisitor(Visitor visitor) override;
// Does nothing, because memory is never freed.
void AddFreeVisitor(Visitor visitor) override {}
bool TracksAllocationSizes() override;
size_t RequestedSize(void* ptr) override;
size_t AllocatedSize(void* ptr) override;
int64 AllocationId(void* ptr) override;
void GetStats(AllocatorStats* stats) override;
private:
struct Bin;
void* AllocateRawInternal(size_t alignment, size_t num_bytes,
bool dump_log_on_failure);
void DeallocateRawInternal(void* ptr);
// A ChunkHandle is an index into the chunks_ vector in BFCAllocator
// kInvalidChunkHandle means an invalid chunk
typedef int ChunkHandle;
static const int kInvalidChunkHandle = -1;
typedef int BinNum;
static const int kInvalidBinNum = -1;
static const int kNumBins = 21;
// Chunks point to memory. Their prev/next pointers form a
// doubly-linked list of addresses sorted by base address that
// must be contiguous. Chunks contain information about whether
// they are in use or whether they are free, and contain a pointer
// to the bin they are in.
struct Chunk {
size_t size = 0; // Full size of buffer.
// We sometimes give chunks that are larger than needed to reduce
// fragmentation. requested_size keeps track of what the client
// actually wanted so we can understand whether our splitting
// strategy is efficient.
size_t requested_size = 0;
// allocation_id is set to -1 when the chunk is not in use. It is assigned a
// value greater than zero before the chunk is returned from
// AllocateRaw, and this value is unique among values assigned by
// the parent allocator.
int64 allocation_id = -1;
void* ptr = nullptr; // pointer to granted subbuffer.
// If not kInvalidChunkHandle, the memory referred to by 'prev' is directly
// preceding the memory used by this chunk. E.g., It should start
// at 'ptr - prev->size'
ChunkHandle prev = kInvalidChunkHandle;
// If not kInvalidChunkHandle, the memory referred to by 'next' is directly
// following the memory used by this chunk. E.g., It should be at
// 'ptr + size'
ChunkHandle next = kInvalidChunkHandle;
// What bin are we in?
BinNum bin_num = kInvalidBinNum;
bool in_use() const { return allocation_id != -1; }
string DebugString(BFCAllocator* a, bool recurse) {
string dbg;
strings::StrAppend(&dbg, " Size: ", strings::HumanReadableNumBytes(size),
" | Requested Size: ",
strings::HumanReadableNumBytes(requested_size),
" | in_use: ", in_use());
if (recurse && prev != BFCAllocator::kInvalidChunkHandle) {
Chunk* p = a->ChunkFromHandle(prev);
strings::StrAppend(&dbg, ", prev: ", p->DebugString(a, false));
}
if (recurse && next != BFCAllocator::kInvalidChunkHandle) {
Chunk* n = a->ChunkFromHandle(next);
strings::StrAppend(&dbg, ", next: ", n->DebugString(a, false));
}
return dbg;
}
};
// A Bin is a collection of similar-sized free chunks.
struct Bin {
// All chunks in this bin have >= bin_size memory.
size_t bin_size = 0;
struct ChunkComparator {
explicit ChunkComparator(BFCAllocator* allocator)
: allocator_(allocator) {}
// Sort first by size and then use pointer address as a tie breaker.
bool operator()(const ChunkHandle ha, const ChunkHandle hb) const {
const Chunk* a = allocator_->ChunkFromHandle(ha);
const Chunk* b = allocator_->ChunkFromHandle(hb);
if (a->size != b->size) {
return a->size < b->size;
}
return a->ptr < b->ptr;
}
private:
BFCAllocator* allocator_; // The parent allocator
};
typedef std::set<ChunkHandle, ChunkComparator> FreeChunkSet;
// List of free chunks within the bin, sorted by chunk size.
// Chunk * not owned.
FreeChunkSet free_chunks;
Bin(BFCAllocator* allocator, size_t bs)
: bin_size(bs), free_chunks(ChunkComparator(allocator)) {}
};
static const size_t kMinAllocationBits = 8;
static const size_t kMinAllocationSize = 1 << kMinAllocationBits;
// AllocationRegion maps pointers to ChunkHandles for a single
// contiguous memory region.
//
// This class is thread-compatible.
class AllocationRegion {
public:
AllocationRegion(void* ptr, size_t memory_size)
: ptr_(ptr),
memory_size_(memory_size),
end_ptr_(
static_cast<void*>(static_cast<char*>(ptr_) + memory_size_)) {
DCHECK_EQ(0, memory_size % kMinAllocationSize);
const size_t n_handles =
(memory_size + kMinAllocationSize - 1) / kMinAllocationSize;
handles_ = new ChunkHandle[n_handles];
for (size_t i = 0; i < n_handles; i++) {
handles_[i] = kInvalidChunkHandle;
}
}
AllocationRegion() {}
~AllocationRegion() { delete[] handles_; }
AllocationRegion(AllocationRegion&& other) { Swap(other); }
AllocationRegion& operator=(AllocationRegion&& other) {
Swap(other);
return *this;
}
void* ptr() const { return ptr_; }
void* end_ptr() const { return end_ptr_; }
size_t memory_size() const { return memory_size_; }
ChunkHandle get_handle(const void* p) const {
return handles_[IndexFor(p)];
}
void set_handle(const void* p, ChunkHandle h) { handles_[IndexFor(p)] = h; }
void erase(const void* p) { set_handle(p, kInvalidChunkHandle); }
private:
void Swap(AllocationRegion& other) {
std::swap(ptr_, other.ptr_);
std::swap(memory_size_, other.memory_size_);
std::swap(end_ptr_, other.end_ptr_);
std::swap(handles_, other.handles_);
}
int IndexFor(const void* p) const {
std::uintptr_t p_int = reinterpret_cast<std::uintptr_t>(p);
std::uintptr_t base_int = reinterpret_cast<std::uintptr_t>(ptr_);
DCHECK_GE(p_int, base_int);
DCHECK_LT(p_int, base_int + memory_size_);
return static_cast<int>(((p_int - base_int) >> kMinAllocationBits));
}
// Metadata about the allocation region.
void* ptr_ = nullptr;
size_t memory_size_ = 0;
void* end_ptr_ = nullptr;
// Array of size "memory_size / kMinAllocationSize". It is
// indexed by (p-base) / kMinAllocationSize, contains ChunkHandle
// for the memory allocation represented by "p"
ChunkHandle* handles_ = nullptr;
TF_DISALLOW_COPY_AND_ASSIGN(AllocationRegion);
};
// RegionManager aggregates one or more "AllocationRegions" and provides
// a layer of indirection from pointers to the underlying ChunkHandle,
// allowing allocation across multiple discontiguous memory regions.
//
// This class is thread-compatible.
class RegionManager {
public:
RegionManager() {}
~RegionManager() {}
void AddAllocationRegion(void* ptr, size_t memory_size) {
// Insert sorted by end_ptr
auto entry =
std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator);
regions_.insert(entry, AllocationRegion(ptr, memory_size));
}
ChunkHandle get_handle(const void* p) const {
return RegionFor(p)->get_handle(p);
}
void set_handle(const void* p, ChunkHandle h) {
return MutableRegionFor(p)->set_handle(p, h);
}
void erase(const void* p) { return MutableRegionFor(p)->erase(p); }
const std::vector<AllocationRegion>& regions() const { return regions_; }
private:
static bool Comparator(const void* ptr, const AllocationRegion& other) {
return ptr < other.end_ptr();
}
AllocationRegion* MutableRegionFor(const void* p) {
return const_cast<AllocationRegion*>(RegionFor(p));
}
const AllocationRegion* RegionFor(const void* p) const {
auto entry =
std::upper_bound(regions_.begin(), regions_.end(), p, &Comparator);
if (entry != regions_.end()) {
return &(*entry);
}
LOG(FATAL) << "Could not find Region for " << p;
return nullptr;
}
private:
std::vector<AllocationRegion> regions_;
};
// Returns 'bytes' rounded up to the next highest kMinAllocationSize.
size_t RoundedBytes(size_t bytes);
// Try to add a new memory region that can satisfy an allocation of
// 'rounded_bytes' bytes. Returns true on success and false on
// failure.
bool Extend(size_t rounded_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Returns a pointer to an underlying allocated chunk of size
// 'rounded_bytes'.
void* FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes)
EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Splits the chunk specified by 'h' into two chunks, one at least
// of size 'num_bytes'.
void SplitChunk(ChunkHandle h, size_t num_bytes)
EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Merges the two chunk handles. Requires that the chunks are
// contiguous in their allocation.
void Merge(ChunkHandle h, ChunkHandle h2) EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Frees the memory represented by 'h', coalescing the chunk if
// possible.
void FreeAndMaybeCoalesce(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Adds the chunk 'h' to the proper free bin.
void InsertFreeChunkIntoBin(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Removes the free chunk pointed to by 'c' from the set free_chunks.
void RemoveFreeChunkIterFromBin(Bin::FreeChunkSet* free_chunks,
const Bin::FreeChunkSet::iterator& c)
EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Removes a free chunk from the bin.
void RemoveFreeChunkFromBin(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Removes the chunk metadata represented by 'h'.
void DeleteChunk(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
string RenderOccupancy() EXCLUSIVE_LOCKS_REQUIRED(lock_);
void DumpMemoryLog(size_t num_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_);
ChunkHandle AllocateChunk() EXCLUSIVE_LOCKS_REQUIRED(lock_);
void DeallocateChunk(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
Chunk* ChunkFromHandle(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
AllocatorRetry retry_helper_;
// Structures immutable after construction
size_t memory_limit_ = 0;
inline int Log2FloorNonZero(uint64 n) {
#if defined(__GNUC__)
return 63 ^ __builtin_clzll(n);
#else
// Portable fallback: compute floor(log2(n)), matching the clz-based branch.
int r = 0;
while (n > 1) {
r++;
n >>= 1;
}
return r;
#endif
}
// Map from bin size to Bin
Bin* BinFromIndex(BinNum index) {
return reinterpret_cast<Bin*>(&(bins_space_[index * sizeof(Bin)]));
}
size_t BinNumToSize(BinNum index) {
return static_cast<size_t>(256) << index;
}
BinNum BinNumForSize(size_t bytes) {
uint64 v = std::max<size_t>(bytes, 256) >> kMinAllocationBits;
int b = std::min(kNumBins - 1, Log2FloorNonZero(v));
return b;
}
Bin* BinForSize(size_t bytes) { return BinFromIndex(BinNumForSize(bytes)); }
char bins_space_[sizeof(Bin) * kNumBins];
// The size of the current region allocation.
size_t curr_region_allocation_bytes_;
// The total number of allocated bytes by the allocator.
size_t total_region_allocated_bytes_ = 0;
// An indicator that expansion of a region has hit the limits
// of the available memory.
bool started_backpedal_ = false;
std::unique_ptr<SubAllocator> suballocator_;
string name_;
// Structures mutable after construction
mutable mutex lock_;
RegionManager region_manager_ GUARDED_BY(lock_);
std::vector<Chunk> chunks_;
ChunkHandle free_chunks_list_; // Ptr to head of linked list of free Chunks
// Called once on each region, ASAP.
std::vector<Visitor> region_visitors_;
// Counter containing the next unique identifier to assign to a
// newly-created chunk.
int64 next_allocation_id_ GUARDED_BY(lock_);
// Stats.
AllocatorStats stats_ GUARDED_BY(lock_);
TF_DISALLOW_COPY_AND_ASSIGN(BFCAllocator);
};
} // namespace tensorflow
#endif // TENSORFLOW_COMMON_RUNTIME_BFC_ALLOCATOR_H_
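For illustration only (this sketch is not part of the commit): the size rounding and bin selection described in the comments above reduce to a few lines of arithmetic. kMinAllocationSize is 256 bytes, requests are rounded up to a multiple of it, and bin b holds free chunks of at least 256 << b bytes. The helper names mirror the declarations above; the standalone driver and its sample sizes are hypothetical.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>

namespace {
constexpr std::size_t kMinAllocationBits = 8;
constexpr std::size_t kMinAllocationSize = 1 << kMinAllocationBits;  // 256 bytes
constexpr int kNumBins = 21;

// Round 'bytes' up to the next multiple of kMinAllocationSize.
std::size_t RoundedBytes(std::size_t bytes) {
  return kMinAllocationSize *
         ((bytes + kMinAllocationSize - 1) / kMinAllocationSize);
}

// floor(log2(n)) for n > 0; portable stand-in for the clz-based version.
int Log2FloorNonZero(std::uint64_t n) {
  int r = 0;
  while (n > 1) {
    r++;
    n >>= 1;
  }
  return r;
}

// Bin b covers rounded sizes in [256 << b, 256 << (b + 1)), capped at the
// last bin.
int BinNumForSize(std::size_t bytes) {
  const std::uint64_t v =
      std::max<std::size_t>(bytes, 256) >> kMinAllocationBits;
  return std::min(kNumBins - 1, Log2FloorNonZero(v));
}
}  // namespace

int main() {
  const std::size_t requests[] = {1, 200, 256, 257, 4096, 1 << 20};
  for (std::size_t bytes : requests) {
    std::printf("request=%zu rounded=%zu bin=%d\n", bytes, RoundedBytes(bytes),
                BinNumForSize(RoundedBytes(bytes)));
  }
  return 0;
}

A 1-byte and a 200-byte request both round to 256 and land in bin 0; 257 bytes rounds to 512 and lands in bin 1; a 1 MiB request lands in bin 12.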

@@ -1170,37 +1170,44 @@ FunctionBody* SymbolicGradientHelper::Compute() {
Copy();
Graph* g = gbody_->graph;
const int num_y = gbody_->ret_nodes.size();
// Populate 'y_node_outputs_' with node function body outputs.
// Populate 'y_grad_nodes' with initial gradient nodes for each return node of
// the original function body (these will be 'arg' nodes in the function
// gradient body).
const int num_y = gbody_->ret_nodes.size();
std::vector<Node*> y_grad_nodes;
y_grad_nodes.reserve(num_y);
std::vector<NodeOut> y_node_outputs;
y_node_outputs.reserve(num_y);
std::vector<NodeOut> y_grad_node_outputs;
y_grad_node_outputs.reserve(num_y);
for (int i = 0; i < num_y; ++i) {
Node* y = gbody_->ret_nodes[i];
y_node_outputs.push_back({y, 0});
DCHECK_EQ(y->type_string(), kRetOp);
const DataType dtype = y->input_type(0);
const int index = gbody_->arg_nodes.size();
Node* dy = AddArg(g, dtype, index);
gbody_->arg_types.push_back(dtype);
gbody_->arg_nodes.push_back(dy);
y_grad_nodes.push_back(dy);
y_grad_node_outputs.push_back({dy, 0});
}
// Populate 'x_nodes' with function args (not including 'y_grad_nodes').
// Populate 'x_nodes' with function args (excluding 'y_grad_node_outputs').
const int num_x = fbody_->arg_nodes.size();
std::vector<Node*> x_nodes;
x_nodes.reserve(num_x);
std::vector<NodeOut> x_node_outputs;
x_node_outputs.reserve(num_x);
for (size_t i = 0; i < fbody_->arg_nodes.size(); ++i) {
x_nodes.push_back(gbody_->arg_nodes[i]);
x_node_outputs.push_back({gbody_->arg_nodes[i], 0});
}
// Call AddSymbolicGradients which will add nodes to graph 'g' that
// compute the function gradient (adding an entry in 'x_grad_nodes' for
// each node in 'x_nodes').
std::vector<GradNodeOutput> x_grad_nodes(x_nodes.size());
TF_CHECK_OK(AddSymbolicGradients(gbody_->ret_nodes, x_nodes, y_grad_nodes,
&x_grad_nodes, g));
// compute the function gradient (adding an entry in 'x_grad_node_outputs' for
// each node in 'x_node_outputs').
std::vector<NodeOut> x_grad_node_outputs;
TF_CHECK_OK(AddSymbolicGradients(y_node_outputs, x_node_outputs,
y_grad_node_outputs, &x_grad_node_outputs,
g));
// Remove the old return nodes from the function body.
for (Node* n : gbody_->ret_nodes) {
@@ -1211,7 +1218,7 @@ FunctionBody* SymbolicGradientHelper::Compute() {
// Add new return nodes to the function gradient body for each node
// in 'x_grad_nodes'.
for (size_t i = 0; i < fbody_->arg_types.size(); ++i) {
Endpoint grad = {x_grad_nodes[i].node, x_grad_nodes[i].index};
Endpoint grad = {x_grad_node_outputs[i].node, x_grad_node_outputs[i].index};
Node* ret = AddRet(g, grad, i);
gbody_->ret_nodes.push_back(ret);
}

@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h"
#include "tensorflow/core/common_runtime/allocator_retry.h"
#include <vector>
#include "tensorflow/core/lib/core/notification.h"
@@ -55,7 +55,7 @@ class FakeAllocator {
}
private:
GPUAllocatorRetry retry_;
AllocatorRetry retry_;
void* good_ptr_ = reinterpret_cast<void*>(0xdeadbeef);
mutex mu_;
size_t memory_capacity_ GUARDED_BY(mu_);

@@ -15,17 +15,7 @@ limitations under the License.
#include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h"
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/lib/core/bits.h"
#include "tensorflow/core/lib/gtl/stl_util.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/types.h"
namespace gpu = ::perftools::gputools;
@@ -36,680 +26,9 @@ GPUBFCAllocator::GPUBFCAllocator(int device_id, size_t total_memory)
GPUBFCAllocator::GPUBFCAllocator(int device_id, size_t total_memory,
const GPUOptions& gpu_options)
: device_id_(device_id),
free_chunks_list_(kInvalidChunkHandle),
next_allocation_id_(1) {
// Get a pointer to the stream_executor for this device
stream_exec_ = GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
if (gpu_options.allow_growth()) {
// 1MiB smallest initial allocation, unless total memory available
// is less.
curr_region_allocation_bytes_ =
RoundedBytes(std::min(total_memory, size_t{1048576}));
} else {
curr_region_allocation_bytes_ = RoundedBytes(total_memory);
}
// Allocate the requested amount of memory.
gpu_memory_size_ = total_memory;
stats_.bytes_limit = static_cast<int64>(total_memory);
// Create a bunch of bins of various good sizes.
// We create bins to fit all possible ranges that cover the
// gpu_memory_size_ starting from allocations up to 256 bytes to
// allocations up to (and including) the memory limit.
for (BinNum b = 0; b < kNumBins; b++) {
size_t bin_size = BinNumToSize(b);
VLOG(1) << "Creating bin of max chunk size "
<< strings::HumanReadableNumBytes(bin_size);
new (BinFromIndex(b)) Bin(this, bin_size);
CHECK_EQ(BinForSize(bin_size), BinFromIndex(b));
CHECK_EQ(BinForSize(bin_size + 255), BinFromIndex(b));
CHECK_EQ(BinForSize(bin_size * 2 - 1), BinFromIndex(b));
if (b + 1 < kNumBins) {
CHECK_NE(BinForSize(bin_size * 2), BinFromIndex(b));
}
}
}
GPUBFCAllocator::~GPUBFCAllocator() {
// Return memory back.
VLOG(2) << "Number of regions allocated: "
<< region_manager_.regions().size();
for (const auto& region : region_manager_.regions()) {
gpu::DeviceMemoryBase gpu_ptr{region.ptr()};
stream_exec_->Deallocate(&gpu_ptr);
}
for (BinNum b = 0; b < kNumBins; b++) {
BinFromIndex(b)->~Bin();
}
}
GPUBFCAllocator::Chunk* GPUBFCAllocator::ChunkFromHandle(ChunkHandle h) {
DCHECK_GE(h, 0);
DCHECK_LT(h, static_cast<int>(chunks_.size()));
return &(chunks_[h]);
}
bool GPUBFCAllocator::Extend(size_t rounded_bytes) {
// Do we have enough space to handle the client's request?
// If not, fail immediately.
if (total_region_allocated_bytes_ + rounded_bytes > gpu_memory_size_) {
return false;
}
// If curr_region_allocation_bytes_ is not enough to satisfy the
// allocation, keep multiplying by a power of two until that is
// sufficient.
bool increased_allocation = false;
while (rounded_bytes > curr_region_allocation_bytes_) {
curr_region_allocation_bytes_ *= 2;
increased_allocation = true;
}
// Try allocating.
size_t bytes = curr_region_allocation_bytes_;
gpu::DeviceMemory<char> gpu_mem = stream_exec_->AllocateArray<char>(bytes);
if (gpu_mem == nullptr && !started_backpedal_) {
// Only backpedal once.
started_backpedal_ = true;
static constexpr float kBackpedalFactor = 0.9;
// Try allocating less memory.
bytes = RoundedBytes(bytes * kBackpedalFactor);
while (gpu_mem == nullptr && bytes > rounded_bytes) {
gpu_mem = stream_exec_->AllocateArray<char>(bytes);
bytes = RoundedBytes(bytes * kBackpedalFactor);
}
}
if (gpu_mem == nullptr) {
return false;
}
if (!increased_allocation) {
// Increase the region size of the next required allocation.
curr_region_allocation_bytes_ *= 2;
}
VLOG(1) << "Extending allocation by " << strings::HumanReadableNumBytes(bytes)
<< " bytes.";
total_region_allocated_bytes_ += bytes;
VLOG(1) << "Total allocated bytes: "
<< strings::HumanReadableNumBytes(total_region_allocated_bytes_);
void* gpu_mem_base = gpu_mem.opaque();
VLOG(1) << "Allocated memory at " << gpu_mem_base << " to "
<< static_cast<void*>(static_cast<char*>(gpu_mem_base) + bytes);
region_manager_.AddAllocationRegion(gpu_mem_base, bytes);
// Create one large chunk for the whole memory space that will
// be chunked later.
ChunkHandle h = AllocateChunk();
GPUBFCAllocator::Chunk* c = ChunkFromHandle(h);
c->ptr = gpu_mem_base;
c->size = bytes;
c->allocation_id = -1;
c->prev = kInvalidChunkHandle;
c->next = kInvalidChunkHandle;
region_manager_.set_handle(c->ptr, h);
// TODO(vrv): Try to merge this new region with an existing region,
// if the address space is contiguous, to avoid fragmentation
// across regions.
// Insert the chunk into the right bin.
InsertFreeChunkIntoBin(h);
// Invoke visitors on newly allocated region.
for (auto visitor : region_visitors_) {
visitor(gpu_mem_base, bytes);
}
return true;
}
GPUBFCAllocator::ChunkHandle GPUBFCAllocator::AllocateChunk() {
if (free_chunks_list_ != kInvalidChunkHandle) {
ChunkHandle h = free_chunks_list_;
Chunk* c = ChunkFromHandle(h);
free_chunks_list_ = c->next;
return h;
} else {
ChunkHandle h = chunks_.size();
chunks_.resize(h + 1);
return h;
}
}
void GPUBFCAllocator::DeallocateChunk(ChunkHandle h) {
Chunk* c = ChunkFromHandle(h);
c->next = free_chunks_list_;
free_chunks_list_ = h;
}
void* GPUBFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes) {
// Fast path: Try once to allocate without getting the retry_helper_ involved
void* r = AllocateRawInternal(unused_alignment, num_bytes, false);
if (r != nullptr) {
return r;
} else {
static const int64 kMaxMillisToWait = 10000; // 10 seconds
return retry_helper_.AllocateRaw(
[this](size_t a, size_t nb, bool v) {
return AllocateRawInternal(a, nb, v);
},
kMaxMillisToWait, unused_alignment, num_bytes);
}
}
void* GPUBFCAllocator::AllocateRaw(
size_t unused_alignment, size_t num_bytes,
const AllocationAttributes& allocation_attr) {
if (allocation_attr.no_retry_on_failure) {
// Return immediately upon the first failure if this is for allocating an
// optional scratch space.
void* result = AllocateRawInternal(unused_alignment, num_bytes, false);
if (result == nullptr) {
// The counter incrementing is not thread-safe. But we don't really care.
// TODO(zhengxq): we should implement a LOG_FIRST_N and LOG_EVERY_N for
// more general usage.
static int log_counter = 0;
if (log_counter < 10) {
log_counter++;
LOG(WARNING)
<< "Ran out of memory trying to allocate "
<< strings::HumanReadableNumBytes(num_bytes)
<< ". The caller indicates that this is not a failure, but"
<< " may mean that there could be performance gains if more"
<< " memory is available.";
}
}
return result;
} else {
return AllocateRaw(unused_alignment, num_bytes);
}
}
// static
size_t GPUBFCAllocator::RoundedBytes(size_t bytes) {
size_t rounded_bytes =
(kMinAllocationSize *
((bytes + kMinAllocationSize - 1) / kMinAllocationSize));
DCHECK_EQ(size_t{0}, rounded_bytes % kMinAllocationSize);
return rounded_bytes;
}
void* GPUBFCAllocator::AllocateRawInternal(size_t unused_alignment,
size_t num_bytes,
bool dump_log_on_failure) {
if (num_bytes == 0) {
LOG(ERROR) << "tried to allocate 0 bytes";
return nullptr;
}
// First, always allocate memory of at least kMinAllocationSize
// bytes, and always allocate multiples of kMinAllocationSize bytes
// so all memory addresses are nicely byte aligned.
size_t rounded_bytes = RoundedBytes(num_bytes);
// The BFC allocator tries to find the best fit first.
BinNum bin_num = BinNumForSize(rounded_bytes);
mutex_lock l(lock_);
void* ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes);
if (ptr != nullptr) {
return ptr;
}
// Try to extend
if (Extend(rounded_bytes)) {
ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes);
if (ptr != nullptr) {
return ptr;
}
}
// We searched all bins for an existing free chunk to use and
// couldn't find one. This means we must have run out of memory.
// Dump the memory log for analysis.
if (dump_log_on_failure) {
DumpMemoryLog(rounded_bytes);
LOG(WARNING) << RenderOccupancy();
LOG(WARNING) << "Ran out of memory trying to allocate "
<< strings::HumanReadableNumBytes(num_bytes)
<< ". See logs for memory state.";
}
return nullptr;
}
void* GPUBFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
size_t num_bytes) {
// First identify the first bin that could satisfy rounded_bytes.
for (; bin_num < kNumBins; bin_num++) {
// Start searching from the first bin for the smallest chunk that fits
// rounded_bytes.
Bin* b = BinFromIndex(bin_num);
for (auto citer = b->free_chunks.begin(); citer != b->free_chunks.end();
++citer) {
const GPUBFCAllocator::ChunkHandle h = (*citer);
GPUBFCAllocator::Chunk* chunk = ChunkFromHandle(h);
DCHECK(!chunk->in_use());
if (chunk->size >= rounded_bytes) {
// We found an existing free chunk that is large enough, so remove it
// from the free bin structure prior to using it.
RemoveFreeChunkIterFromBin(&b->free_chunks, citer);
// If we can break the size of the chunk into two reasonably
// large pieces, do so.
//
// TODO(vrv): What should be the criteria when deciding when
// to split?
if (chunk->size >= rounded_bytes * 2) {
SplitChunk(h, rounded_bytes);
chunk = ChunkFromHandle(h); // Update chunk pointer in case it moved
}
// The requested size of the returned chunk is what the user
// has allocated.
chunk->requested_size = num_bytes;
// Assign a unique id and increment the id counter, marking the
// chunk as being in use.
chunk->allocation_id = next_allocation_id_++;
// Update stats.
++stats_.num_allocs;
stats_.bytes_in_use += chunk->size;
stats_.max_bytes_in_use =
std::max(stats_.max_bytes_in_use, stats_.bytes_in_use);
stats_.max_alloc_size =
std::max<std::size_t>(stats_.max_alloc_size, chunk->size);
VLOG(4) << "Returning: " << chunk->ptr;
if (VLOG_IS_ON(4)) {
LOG(INFO) << "A: " << RenderOccupancy();
}
return chunk->ptr;
}
}
}
return nullptr;
}
void GPUBFCAllocator::SplitChunk(GPUBFCAllocator::ChunkHandle h,
size_t num_bytes) {
// Allocate the new chunk before we do any ChunkFromHandle
ChunkHandle h_new_chunk = AllocateChunk();
Chunk* c = ChunkFromHandle(h);
CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum));
// Create a new chunk starting num_bytes after c
GPUBFCAllocator::Chunk* new_chunk = ChunkFromHandle(h_new_chunk);
new_chunk->ptr = static_cast<void*>(static_cast<char*>(c->ptr) + num_bytes);
region_manager_.set_handle(new_chunk->ptr, h_new_chunk);
// Set the new sizes of the chunks.
new_chunk->size = c->size - num_bytes;
c->size = num_bytes;
// The new chunk is not in use.
new_chunk->allocation_id = -1;
// Maintain the pointers.
// c <-> c_neighbor becomes
// c <-> new_chunk <-> c_neighbor
GPUBFCAllocator::ChunkHandle h_neighbor = c->next;
new_chunk->prev = h;
new_chunk->next = h_neighbor;
c->next = h_new_chunk;
if (h_neighbor != kInvalidChunkHandle) {
Chunk* c_neighbor = ChunkFromHandle(h_neighbor);
c_neighbor->prev = h_new_chunk;
}
// Add the newly free chunk to the free bin.
InsertFreeChunkIntoBin(h_new_chunk);
}
void GPUBFCAllocator::DeallocateRaw(void* ptr) {
DeallocateRawInternal(ptr);
retry_helper_.NotifyDealloc();
}
void GPUBFCAllocator::DeallocateRawInternal(void* ptr) {
if (ptr == nullptr) {
LOG(ERROR) << "tried to deallocate nullptr";
return;
}
mutex_lock l(lock_);
// Find the chunk from the ptr.
GPUBFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
CHECK(h != kInvalidChunkHandle);
// Consider coalescing it.
FreeAndMaybeCoalesce(h);
if (VLOG_IS_ON(4)) {
LOG(INFO) << "F: " << RenderOccupancy();
}
}
// Merges h1 and h2 when Chunk(h1)->next is h2 and Chunk(h2)->prev is h1.
// We merge Chunk(h2) into Chunk(h1).
void GPUBFCAllocator::Merge(GPUBFCAllocator::ChunkHandle h1,
GPUBFCAllocator::ChunkHandle h2) {
Chunk* c1 = ChunkFromHandle(h1);
Chunk* c2 = ChunkFromHandle(h2);
// We can only merge chunks that are not in use.
CHECK(!c1->in_use() && !c2->in_use());
// c1's prev doesn't change, still points to the same ptr, and is
// still not in use.
// Fix up neighbor pointers
//
// c1 <-> c2 <-> c3 should become
// c1 <-> c3
GPUBFCAllocator::ChunkHandle h3 = c2->next;
c1->next = h3;
CHECK(c2->prev == h1);
if (h3 != kInvalidChunkHandle) {
GPUBFCAllocator::Chunk* c3 = ChunkFromHandle(h3);
c3->prev = h1;
}
// Set the new size
c1->size += c2->size;
DeleteChunk(h2);
}
void GPUBFCAllocator::DeleteChunk(ChunkHandle h) {
// Delete h and cleanup all state
Chunk* c = ChunkFromHandle(h);
// VLOG(4) << "Removing: " << c->ptr;
region_manager_.erase(c->ptr);
DeallocateChunk(h);
}
void GPUBFCAllocator::InsertFreeChunkIntoBin(GPUBFCAllocator::ChunkHandle h) {
Chunk* c = ChunkFromHandle(h);
CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum));
BinNum bin_num = BinNumForSize(c->size);
Bin* new_bin = BinFromIndex(bin_num);
c->bin_num = bin_num;
new_bin->free_chunks.insert(h);
}
void GPUBFCAllocator::RemoveFreeChunkIterFromBin(
GPUBFCAllocator::Bin::FreeChunkSet* free_chunks,
const GPUBFCAllocator::Bin::FreeChunkSet::iterator& citer) {
ChunkHandle h = *citer;
Chunk* c = ChunkFromHandle(h);
CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum));
free_chunks->erase(citer);
c->bin_num = kInvalidBinNum;
}
void GPUBFCAllocator::RemoveFreeChunkFromBin(GPUBFCAllocator::ChunkHandle h) {
Chunk* c = ChunkFromHandle(h);
CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum));
int count = BinFromIndex(c->bin_num)->free_chunks.erase(h);
CHECK(count > 0) << "Could not find chunk in bin";
c->bin_num = kInvalidBinNum;
}
void GPUBFCAllocator::FreeAndMaybeCoalesce(GPUBFCAllocator::ChunkHandle h) {
Chunk* c = ChunkFromHandle(h);
CHECK(c->in_use() && (c->bin_num == kInvalidBinNum));
// Mark the chunk as no longer in use
c->allocation_id = -1;
// Updates the stats.
stats_.bytes_in_use -= c->size;
// This chunk is no longer in-use, consider coalescing the chunk
// with adjacent chunks.
ChunkHandle chunk_to_reassign = h;
// If the next chunk is free, coalesce the two
if (c->next != kInvalidChunkHandle) {
Chunk* cnext = ChunkFromHandle(c->next);
if (!cnext->in_use()) {
// VLOG(8) << "Chunk at " << cnext->ptr << " merging with c " <<
// c->ptr;
chunk_to_reassign = h;
// Deletes c->next
RemoveFreeChunkFromBin(c->next);
Merge(h, ChunkFromHandle(h)->next);
}
}
// If the previous chunk is free, coalesce the two
c = ChunkFromHandle(h);
if (c->prev != kInvalidChunkHandle) {
Chunk* cprev = ChunkFromHandle(c->prev);
if (!cprev->in_use()) {
// VLOG(8) << "Chunk at " << c->ptr << " merging into c->prev "
// << cprev->ptr;
chunk_to_reassign = c->prev;
// Deletes c
RemoveFreeChunkFromBin(c->prev);
Merge(ChunkFromHandle(h)->prev, h);
c = ChunkFromHandle(h);
}
}
InsertFreeChunkIntoBin(chunk_to_reassign);
}
void GPUBFCAllocator::AddAllocVisitor(Visitor visitor) {
VLOG(1) << "AddVisitor";
mutex_lock l(lock_);
region_visitors_.push_back(visitor);
for (const auto& region : region_manager_.regions()) {
visitor(region.ptr(), region.memory_size());
}
}
bool GPUBFCAllocator::TracksAllocationSizes() { return true; }
size_t GPUBFCAllocator::RequestedSize(void* ptr) {
mutex_lock l(lock_);
GPUBFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
CHECK(h != kInvalidChunkHandle)
<< "Asked for requested size of pointer we never allocated: " << ptr;
GPUBFCAllocator::Chunk* c = ChunkFromHandle(h);
return c->requested_size;
}
size_t GPUBFCAllocator::AllocatedSize(void* ptr) {
mutex_lock l(lock_);
GPUBFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
CHECK(h != kInvalidChunkHandle)
<< "Asked for allocated size of pointer we never allocated: " << ptr;
GPUBFCAllocator::Chunk* c = ChunkFromHandle(h);
return c->size;
}
int64 GPUBFCAllocator::AllocationId(void* ptr) {
mutex_lock l(lock_);
GPUBFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
CHECK(h != kInvalidChunkHandle)
<< "Asked for allocation id of pointer we never allocated: " << ptr;
GPUBFCAllocator::Chunk* c = ChunkFromHandle(h);
return c->allocation_id;
}
namespace {
void RenderRegion(char* rendered, const size_t resolution,
const size_t total_render_size, const size_t offset,
const void* base_ptr, const void* ptr, const size_t size,
const char c) {
const char* base_ptr_c = static_cast<const char*>(base_ptr);
const char* ptr_c = static_cast<const char*>(ptr);
size_t start_location =
((ptr_c - base_ptr_c + offset) * resolution) / total_render_size;
CHECK_GE(start_location, 0);
CHECK_LT(start_location, resolution);
size_t end_location =
((ptr_c + size - 1 - base_ptr_c + offset) * resolution) /
total_render_size;
CHECK_GE(end_location, 0);
CHECK_LT(end_location, resolution);
for (size_t i = start_location; i <= end_location; ++i) {
rendered[i] = c;
}
}
} // namespace
string GPUBFCAllocator::RenderOccupancy() {
// Make a buffer for the ASCII-art representation.
const size_t resolution = 100;
char rendered[resolution];
// Compute the total region size to render over
size_t total_region_size = 0;
for (const auto& region : region_manager_.regions()) {
total_region_size += region.memory_size();
}
// Start out with everything empty
RenderRegion(rendered, resolution, total_region_size, 0, nullptr, nullptr,
total_region_size, '_');
size_t region_offset = 0;
for (const auto& region : region_manager_.regions()) {
ChunkHandle h = region_manager_.get_handle(region.ptr());
// Then render each chunk left to right.
while (h != kInvalidChunkHandle) {
Chunk* c = ChunkFromHandle(h);
if (c->in_use()) {
// Render the wasted space
size_t wasted = c->size - c->requested_size;
if (wasted > 0) {
RenderRegion(rendered, resolution, total_region_size,
region_offset + c->requested_size, region.ptr(), c->ptr,
wasted, 'x');
}
// Then the occupied space
RenderRegion(rendered, resolution, total_region_size, region_offset,
region.ptr(), c->ptr, c->requested_size, '*');
}
h = c->next;
}
region_offset += region.memory_size();
}
return StringPiece(rendered, resolution).ToString();
}
void GPUBFCAllocator::DumpMemoryLog(size_t num_bytes) {
// For each bin: tally up the total number of chunks and bytes.
// Note that bins hold only free chunks.
for (BinNum bin_num = 0; bin_num < kNumBins; bin_num++) {
Bin* b = BinFromIndex(bin_num);
size_t total_bytes_in_use = 0;
size_t total_bytes_in_bin = 0;
size_t total_requested_bytes_in_use = 0;
size_t total_requested_bytes_in_bin = 0;
size_t total_chunks_in_use = 0;
size_t total_chunks_in_bin = 0;
for (ChunkHandle h : b->free_chunks) {
Chunk* c = ChunkFromHandle(h);
total_bytes_in_bin += c->size;
total_requested_bytes_in_bin += c->requested_size;
++total_chunks_in_bin;
if (c->in_use()) {
total_bytes_in_use += c->size;
total_requested_bytes_in_use += c->requested_size;
++total_chunks_in_use;
}
}
LOG(INFO) << "Bin (" << b->bin_size
<< "): \tTotal Chunks: " << total_chunks_in_bin
<< ", Chunks in use: " << total_chunks_in_use << " "
<< strings::HumanReadableNumBytes(total_bytes_in_bin)
<< " allocated for chunks. "
<< strings::HumanReadableNumBytes(total_requested_bytes_in_bin)
<< " client-requested for chunks. "
<< strings::HumanReadableNumBytes(total_bytes_in_use)
<< " in use in bin. "
<< strings::HumanReadableNumBytes(total_requested_bytes_in_use)
<< " client-requested in use in bin.";
}
// Find the bin that we would have liked to allocate in, so we
// can get some further analysis about fragmentation.
Bin* b = BinForSize(num_bytes);
LOG(INFO) << "Bin for " << strings::HumanReadableNumBytes(num_bytes)
<< " was " << strings::HumanReadableNumBytes(b->bin_size)
<< ", Chunk State: ";
for (ChunkHandle h : b->free_chunks) {
Chunk* c = ChunkFromHandle(h);
LOG(INFO) << c->DebugString(this, true);
}
// Next show the chunks that are in use, and also summarize their
// number by size.
std::map<size_t, int> in_use_by_size;
for (const auto& region : region_manager_.regions()) {
ChunkHandle h = region_manager_.get_handle(region.ptr());
while (h != kInvalidChunkHandle) {
const Chunk* c = ChunkFromHandle(h);
if (c->in_use()) {
in_use_by_size[c->size]++;
LOG(INFO) << "Chunk at " << c->ptr << " of size " << c->size;
}
h = c->next;
}
h = region_manager_.get_handle(region.ptr());
while (h != kInvalidChunkHandle) {
const Chunk* c = ChunkFromHandle(h);
if (!c->in_use()) {
LOG(INFO) << "Free at " << c->ptr << " of size " << c->size;
}
h = c->next;
}
}
LOG(INFO) << " Summary of in-use Chunks by size: ";
size_t total_bytes = 0;
for (auto& it : in_use_by_size) {
LOG(INFO) << it.second << " Chunks of size " << it.first << " totalling "
<< strings::HumanReadableNumBytes(it.first * it.second);
total_bytes += (it.first * it.second);
}
LOG(INFO) << "Sum Total of in-use chunks: "
<< strings::HumanReadableNumBytes(total_bytes);
LOG(INFO) << "Stats: \n" << stats_.DebugString();
}
void GPUBFCAllocator::GetStats(AllocatorStats* stats) {
mutex_lock l(lock_);
*stats = stats_;
}
: BFCAllocator(
new GPUMemAllocator(
GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie()),
total_memory, gpu_options.allow_growth(), "gpu_bfc") {}
} // namespace tensorflow

@@ -21,396 +21,62 @@ limitations under the License.
#include <unordered_map>
#include <vector>
#include "tensorflow/core/common_runtime/gpu/gpu_allocator_retry.h"
#include "tensorflow/core/common_runtime/gpu/visitable_allocator.h"
#include "tensorflow/core/lib/gtl/stl_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/common_runtime/allocator_retry.h"
#include "tensorflow/core/common_runtime/bfc_allocator.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/protobuf/config.pb.h"
namespace gpu = ::perftools::gputools;
namespace tensorflow {
// A GPU memory allocator that implements a 'best-fit with coalescing'
// algorithm. This is essentially a very simple version of Doug Lea's
// malloc (dlmalloc).
//
// The goal of this allocator is to support defragmentation via
// coalescing. One assumption we make is that the process using this
// allocator owns pretty much all of the GPU memory, and that nearly
// all requests to allocate GPU memory go through this interface.
class GPUBFCAllocator : public VisitableAllocator {
// algorithm.
class GPUBFCAllocator : public BFCAllocator {
public:
// 'device_id' refers to the StreamExecutor ID of the device within
// the process and must reference a valid ID in the process.
GPUBFCAllocator(int device_id, size_t total_memory);
GPUBFCAllocator(int device_id, size_t total_memory,
const GPUOptions& gpu_options);
~GPUBFCAllocator() override;
string Name() override { return "gpu_bfc"; }
void* AllocateRaw(size_t alignment, size_t num_bytes) override;
void* AllocateRaw(size_t alignment, size_t num_bytes,
const AllocationAttributes& allocation_attr) override;
void DeallocateRaw(void* ptr) override;
void AddAllocVisitor(Visitor visitor) override;
// Does nothing, because gpu memory is never freed.
void AddFreeVisitor(Visitor visitor) override {}
bool TracksAllocationSizes() override;
size_t RequestedSize(void* ptr) override;
size_t AllocatedSize(void* ptr) override;
int64 AllocationId(void* ptr) override;
void GetStats(AllocatorStats* stats) override;
private:
struct Bin;
void* AllocateRawInternal(size_t alignment, size_t num_bytes,
bool dump_log_on_failure);
void DeallocateRawInternal(void* ptr);
// A ChunkHandle is an index into the chunks_ vector in GPUBFCAllocator
// kInvalidChunkHandle means an invalid chunk
typedef int ChunkHandle;
static const int kInvalidChunkHandle = -1;
typedef int BinNum;
static const int kInvalidBinNum = -1;
static const int kNumBins = 21;
// Chunks point to GPU memory. Their prev/next pointers form a
// doubly-linked list of addresses sorted by GPU base address that
// must be contiguous. Chunks contain information about whether
// they are in use or whether they are free, and contain a pointer
// to the bin they are in.
struct Chunk {
size_t size = 0; // Full size of GPU buffer.
// We sometimes give chunks that are larger than needed to reduce
// fragmentation. requested_size keeps track of what the client
// actually wanted so we can understand whether our splitting
// strategy is efficient.
size_t requested_size = 0;
// allocation_id is set to -1 when the chunk is not in use. It is assigned a
// value greater than zero before the chunk is returned from
// AllocateRaw, and this value is unique among values assigned by
// the parent allocator.
int64 allocation_id = -1;
void* ptr = nullptr; // pointer to granted GPU subbuffer.
// If not kInvalidChunkHandle, the memory referred to by 'prev' is directly
// preceding the memory used by this chunk. E.g., It should start
// at 'ptr - prev->size'
ChunkHandle prev = kInvalidChunkHandle;
// If not kInvalidChunkHandle, the memory referred to by 'next' is directly
// following the memory used by this chunk. E.g., It should be at
// 'ptr + size'
ChunkHandle next = kInvalidChunkHandle;
// What bin are we in?
BinNum bin_num = kInvalidBinNum;
bool in_use() const { return allocation_id != -1; }
string DebugString(GPUBFCAllocator* a, bool recurse) {
string dbg;
strings::StrAppend(&dbg, " Size: ", strings::HumanReadableNumBytes(size),
" | Requested Size: ",
strings::HumanReadableNumBytes(requested_size),
" | in_use: ", in_use());
if (recurse && prev != GPUBFCAllocator::kInvalidChunkHandle) {
Chunk* p = a->ChunkFromHandle(prev);
strings::StrAppend(&dbg, ", prev: ", p->DebugString(a, false));
}
if (recurse && next != GPUBFCAllocator::kInvalidChunkHandle) {
Chunk* n = a->ChunkFromHandle(next);
strings::StrAppend(&dbg, ", next: ", n->DebugString(a, false));
}
return dbg;
}
};
// A Bin is a collection of similar-sized free chunks.
struct Bin {
// All chunks in this bin have >= bin_size memory.
size_t bin_size = 0;
struct ChunkComparator {
explicit ChunkComparator(GPUBFCAllocator* allocator)
: allocator_(allocator) {}
// Sort first by size and then use pointer address as a tie breaker.
bool operator()(const ChunkHandle ha, const ChunkHandle hb) const {
const Chunk* a = allocator_->ChunkFromHandle(ha);
const Chunk* b = allocator_->ChunkFromHandle(hb);
if (a->size != b->size) {
return a->size < b->size;
}
return a->ptr < b->ptr;
}
private:
GPUBFCAllocator* allocator_; // The parent allocator
};
typedef std::set<ChunkHandle, ChunkComparator> FreeChunkSet;
// List of free chunks within the bin, sorted by chunk size.
// Chunk * not owned.
FreeChunkSet free_chunks;
Bin(GPUBFCAllocator* allocator, size_t bs)
: bin_size(bs), free_chunks(ChunkComparator(allocator)) {}
};
static const size_t kMinAllocationBits = 8;
static const size_t kMinAllocationSize = 1 << kMinAllocationBits;
// AllocationRegion maps pointers to ChunkHandles for a single
// contiguous memory region.
//
// This class is thread-compatible.
class AllocationRegion {
public:
AllocationRegion(void* ptr, size_t memory_size)
: ptr_(ptr),
memory_size_(memory_size),
end_ptr_(
static_cast<void*>(static_cast<char*>(ptr_) + memory_size_)) {
DCHECK_EQ(0, memory_size % kMinAllocationSize);
const size_t n_handles =
(memory_size + kMinAllocationSize - 1) / kMinAllocationSize;
handles_ = new ChunkHandle[n_handles];
for (size_t i = 0; i < n_handles; i++) {
handles_[i] = kInvalidChunkHandle;
}
}
AllocationRegion() {}
~AllocationRegion() { delete[] handles_; }
AllocationRegion(AllocationRegion&& other) { Swap(other); }
AllocationRegion& operator=(AllocationRegion&& other) {
Swap(other);
return *this;
}
void* ptr() const { return ptr_; }
void* end_ptr() const { return end_ptr_; }
size_t memory_size() const { return memory_size_; }
ChunkHandle get_handle(const void* p) const {
return handles_[IndexFor(p)];
}
void set_handle(const void* p, ChunkHandle h) { handles_[IndexFor(p)] = h; }
void erase(const void* p) { set_handle(p, kInvalidChunkHandle); }
private:
void Swap(AllocationRegion& other) {
std::swap(ptr_, other.ptr_);
std::swap(memory_size_, other.memory_size_);
std::swap(end_ptr_, other.end_ptr_);
std::swap(handles_, other.handles_);
}
int IndexFor(const void* p) const {
std::uintptr_t p_int = reinterpret_cast<std::uintptr_t>(p);
std::uintptr_t base_int = reinterpret_cast<std::uintptr_t>(ptr_);
DCHECK_GE(p_int, base_int);
DCHECK_LT(p_int, base_int + memory_size_);
return static_cast<int>(((p_int - base_int) >> kMinAllocationBits));
}
// Metadata about the allocation region.
void* ptr_ = nullptr;
size_t memory_size_ = 0;
void* end_ptr_ = nullptr;
// Array of size "memory_size / kMinAllocationSize". It is
// indexed by (p-base) / kMinAllocationSize, contains ChunkHandle
// for the memory allocation represented by "p"
ChunkHandle* handles_ = nullptr;
TF_DISALLOW_COPY_AND_ASSIGN(AllocationRegion);
};
// RegionManager aggregates one or more "AllocationRegions" and provides
// a layer of indirection from pointers to the underlying ChunkHandle,
// allowing allocation across multiple discontiguous memory regions.
//
// This class is thread-compatible.
class RegionManager {
public:
RegionManager() {}
~RegionManager() {}
void AddAllocationRegion(void* ptr, size_t memory_size) {
// Insert sorted by end_ptr
auto entry =
std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator);
regions_.insert(entry, AllocationRegion(ptr, memory_size));
}
ChunkHandle get_handle(const void* p) const {
return RegionFor(p)->get_handle(p);
}
void set_handle(const void* p, ChunkHandle h) {
return MutableRegionFor(p)->set_handle(p, h);
}
void erase(const void* p) { return MutableRegionFor(p)->erase(p); }
const std::vector<AllocationRegion>& regions() const { return regions_; }
private:
static bool Comparator(const void* ptr, const AllocationRegion& other) {
return ptr < other.end_ptr();
}
AllocationRegion* MutableRegionFor(const void* p) {
return const_cast<AllocationRegion*>(RegionFor(p));
}
const AllocationRegion* RegionFor(const void* p) const {
auto entry =
std::upper_bound(regions_.begin(), regions_.end(), p, &Comparator);
if (entry != regions_.end()) {
return &(*entry);
}
LOG(FATAL) << "Could not find Region for " << p;
return nullptr;
}
private:
std::vector<AllocationRegion> regions_;
};
// Returns 'bytes' rounded up to the next highest kMinAllocationSize.
size_t RoundedBytes(size_t bytes);
// Try to add a new memory region that can satisfy an allocation of
// 'rounded_bytes' bytes. Returns true on success and false on
// failure.
bool Extend(size_t rounded_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Returns a pointer to an underlying allocated chunk of size
// 'rounded_bytes'.
void* FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes)
EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Splits the chunk specified by 'h' into two chunks, one at least
// of size 'num_bytes'.
void SplitChunk(ChunkHandle h, size_t num_bytes)
EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Merges the two chunk handles. Requires that the chunks are
// contiguous in their allocation.
void Merge(ChunkHandle h, ChunkHandle h2) EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Frees the memory represented by 'h', coalescing the chunk if
// possible.
void FreeAndMaybeCoalesce(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Adds the chunk 'h' to the proper free bin.
void InsertFreeChunkIntoBin(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Removes the free chunk pointed to by 'c' from the set free_chunks.
void RemoveFreeChunkIterFromBin(Bin::FreeChunkSet* free_chunks,
const Bin::FreeChunkSet::iterator& c)
EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Removes a free chunk from the bin.
void RemoveFreeChunkFromBin(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Removes the chunk metadata represented by 'h'.
void DeleteChunk(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
string RenderOccupancy() EXCLUSIVE_LOCKS_REQUIRED(lock_);
void DumpMemoryLog(size_t num_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_);
ChunkHandle AllocateChunk() EXCLUSIVE_LOCKS_REQUIRED(lock_);
void DeallocateChunk(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
Chunk* ChunkFromHandle(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);
GPUAllocatorRetry retry_helper_;
// Structures immutable after construction
const int device_id_;
size_t gpu_memory_size_ = 0;
inline int Log2FloorNonZero(uint64 n) {
#if defined(__GNUC__)
return 63 ^ __builtin_clzll(n);
#else
int r = 0;
while (n > 0) {
r++;
n >>= 1;
}
return r;
#endif
}
// Map from bin size to Bin
Bin* BinFromIndex(BinNum index) {
return reinterpret_cast<Bin*>(&(bins_space_[index * sizeof(Bin)]));
}
size_t BinNumToSize(BinNum index) {
return static_cast<size_t>(256) << index;
}
BinNum BinNumForSize(size_t bytes) {
uint64 v = std::max<size_t>(bytes, 256) >> kMinAllocationBits;
int b = std::min(kNumBins - 1, Log2FloorNonZero(v));
return b;
}
Bin* BinForSize(size_t bytes) { return BinFromIndex(BinNumForSize(bytes)); }
char bins_space_[sizeof(Bin) * kNumBins];
perftools::gputools::StreamExecutor* stream_exec_; // Not owned.
// The size of the current region allocation.
size_t curr_region_allocation_bytes_;
// The total number of allocated bytes by the allocator.
size_t total_region_allocated_bytes_ = 0;
// An indicator that expansion of a region has hit the limits
// of the available GPU memory.
bool started_backpedal_ = false;
// Structures mutable after construction
mutable mutex lock_;
RegionManager region_manager_ GUARDED_BY(lock_);
std::vector<Chunk> chunks_;
ChunkHandle free_chunks_list_; // Ptr to head of linked list of free Chunks
// Called once on each region, ASAP.
std::vector<Visitor> region_visitors_;
// Counter containing the next unique identifier to assign to a
// newly-created chunk.
int64 next_allocation_id_ GUARDED_BY(lock_);
// Stats.
AllocatorStats stats_ GUARDED_BY(lock_);
virtual ~GPUBFCAllocator() {}
TF_DISALLOW_COPY_AND_ASSIGN(GPUBFCAllocator);
};
// Suballocator for GPU memory.
class GPUMemAllocator : public SubAllocator {
public:
// Note: stream_exec cannot be null.
explicit GPUMemAllocator(perftools::gputools::StreamExecutor* stream_exec)
: stream_exec_(stream_exec) {
CHECK(stream_exec_ != nullptr);
}
~GPUMemAllocator() override {}
void* Alloc(size_t alignment, size_t num_bytes) override {
void* ptr = nullptr;
if (num_bytes > 0) {
ptr = stream_exec_->AllocateArray<char>(num_bytes).opaque();
}
return ptr;
}
void Free(void* ptr, size_t num_bytes) override {
if (ptr != nullptr) {
gpu::DeviceMemoryBase gpu_ptr(ptr);
stream_exec_->Deallocate(&gpu_ptr);
}
}
private:
perftools::gputools::StreamExecutor* stream_exec_; // not owned, non-null
TF_DISALLOW_COPY_AND_ASSIGN(GPUMemAllocator);
};
} // namespace tensorflow
#endif // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_BFC_ALLOCATOR_H_
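A hedged usage sketch, not from this commit: after the refactor the GPU subclass only contributes a GPUMemAllocator and a name to the shared BFCAllocator machinery, so construction and use look the same as before. The function name, device id 0, 1 GiB budget, and buffer sizes below are illustrative assumptions; a visible GPU (StreamExecutor device 0) is required.

#include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
#include "tensorflow/core/protobuf/config.pb.h"

// Sketch only: allocate and release one 4 KiB GPU buffer through the BFC path.
void ExampleGpuBfcAllocatorUsage() {
  tensorflow::GPUOptions options;
  options.set_allow_growth(true);  // start small and grow regions on demand
  tensorflow::GPUBFCAllocator allocator(/*device_id=*/0,
                                        /*total_memory=*/1ULL << 30, options);
  void* buf = allocator.AllocateRaw(/*alignment=*/256, /*num_bytes=*/4096);
  // ... launch work that consumes buf ...
  allocator.DeallocateRaw(buf);
}

With allow_growth set, the allocator starts from a region of roughly 1 MiB and doubles it on demand instead of reserving the full budget up front, mirroring the curr_region_allocation_bytes_ logic shown earlier.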

@@ -20,7 +20,7 @@ limitations under the License.
#include <string>
#include <unordered_map>
#include "tensorflow/core/common_runtime/gpu/visitable_allocator.h"
#include "tensorflow/core/common_runtime/visitable_allocator.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/types.h"

@@ -226,30 +226,6 @@ TEST(EventMgr, ManySmallTensorsSeparateCallsFlushed) {
}
}
// Running the polling loop should clear the queue, without an explicit
// poll call here, given a moderate delay.
TEST(EventMgr, LongDelayedPolling) {
auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie();
EventMgr em(stream_exec, GPUOptions());
TEST_EventMgrHelper th(&em);
EXPECT_EQ(0, th.queue_size());
EXPECT_EQ(0, th.free_size());
std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec));
CHECK(stream.get());
stream->Init();
for (int i = 0; i < 5; ++i) {
TensorReferenceVector* v = new TensorReferenceVector;
AddTensorReference(v, 100 * 1048576);
th.QueueTensors(stream.get(), v);
EXPECT_EQ(1 + i, th.queue_size());
EXPECT_EQ(0, th.free_size());
}
th.StartPollingLoop();
sleep(1);
EXPECT_EQ(0, th.queue_size());
EXPECT_EQ(5, th.free_size());
}
// Deleting the EventMgr when events are still pending should shut
// down gracefully.
TEST(EventMgr, NonEmptyShutdown) {

@@ -24,7 +24,7 @@ limitations under the License.
#include <map>
#include <memory>
#include <vector>
#include "tensorflow/core/common_runtime/gpu/visitable_allocator.h"
#include "tensorflow/core/common_runtime/visitable_allocator.h"
#include "tensorflow/core/lib/core/bits.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
@@ -35,14 +35,6 @@ limitations under the License.
namespace tensorflow {
// Interface of an object that does the underlying alloc/free of memory.
class SubAllocator {
public:
virtual ~SubAllocator() {}
virtual void* Alloc(size_t alignment, size_t num_bytes) = 0;
virtual void Free(void* ptr, size_t num_bytes) = 0;
};
// Interface of an object that rounds up integers.
class RoundUpInterface {
public:

@@ -187,9 +187,17 @@ Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) {
gpu::Platform* gpu_platform = GPUMachineManager();
gpu::StreamExecutor* se = gpu_platform->ExecutorForDevice(0).ValueOrDie();
CHECK(se);
Allocator* allocator = new PoolAllocator(
100 /*pool_size_limit*/, true /*auto_resize*/,
new CUDAHostAllocator(se), new Pow2Rounder, "cuda_host");
Allocator* allocator = nullptr;
static constexpr bool kCudaHostMemoryUseBFC = true;
if (kCudaHostMemoryUseBFC) {
allocator =
new BFCAllocator(new CUDAHostAllocator(se), 1LL << 36 /*64GB max*/,
true /*allow_growth*/, "cuda_host_bfc" /*name*/);
} else {
allocator = new PoolAllocator(
100 /*pool_size_limit*/, true /*auto_resize*/,
new CUDAHostAllocator(se), new Pow2Rounder, "cuda_host");
}
if (LogMemory::IsEnabled()) {
// Wrap the allocator to track allocation ids for better logging
// at the cost of performance.

@@ -315,11 +315,20 @@ class ColocationGraph {
device_set_->FindMatchingDevices(specified_device_name,
&devices_matching_nodedef);
if (devices_matching_nodedef.empty()) {
// Sometimes it is almost impossible to understand the problem
// without a list of available devices.
std::vector<string> device_names;
for (const Device* device : device_set_->devices()) {
device_names.push_back(device->name());
}
std::sort(device_names.begin(), device_names.end());
return errors::InvalidArgument(
"Could not satisfy explicit device specification '",
node->def().device(),
"' because no devices matching that specification "
"are registered in this process");
"are registered in this process; available devices: ",
str_util::Join(device_names, ", "));
} else if (specified_device_name.has_type) {
return errors::InvalidArgument(
"Could not satisfy explicit device specification '",

@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_VISITABLE_ALLOCATOR_H_
#define TENSORFLOW_COMMON_RUNTIME_GPU_VISITABLE_ALLOCATOR_H_
#ifndef TENSORFLOW_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
#define TENSORFLOW_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
#include <functional>
#include "tensorflow/core/framework/allocator.h"
@@ -42,4 +42,4 @@ class VisitableAllocator : public Allocator {
virtual void AddFreeVisitor(Visitor visitor) = 0;
};
} // namespace tensorflow
#endif // TENSORFLOW_COMMON_RUNTIME_GPU_VISITABLE_ALLOCATOR_H_
#endif // TENSORFLOW_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_

@@ -292,6 +292,15 @@ Allocator* cpu_allocator();
// AllocatorStats. By default, it's disabled.
void EnableCPUAllocatorStats(bool enable);
// Abstract interface of an object that does the underlying suballoc/free of
// memory for a higher-level allocator.
class SubAllocator {
public:
virtual ~SubAllocator() {}
virtual void* Alloc(size_t alignment, size_t num_bytes) = 0;
virtual void Free(void* ptr, size_t num_bytes) = 0;
};
} // namespace tensorflow
#endif // TENSORFLOW_FRAMEWORK_ALLOCATOR_H_
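SubAllocator is now the extension point the BFC machinery builds on, so a minimal conforming implementation may help for orientation. The HostMemSubAllocator below is a hypothetical, POSIX-only sketch (it relies on posix_memalign); it is not part of this commit.

#include <stdlib.h>

#include <algorithm>

#include "tensorflow/core/framework/allocator.h"

namespace tensorflow {

// Hypothetical suballocator: hands out large aligned host-memory regions that
// a higher-level allocator (e.g. BFCAllocator) then carves into chunks.
class HostMemSubAllocator : public SubAllocator {
 public:
  void* Alloc(size_t alignment, size_t num_bytes) override {
    if (num_bytes == 0) return nullptr;
    void* ptr = nullptr;
    // posix_memalign needs a power-of-two alignment that is also a multiple
    // of sizeof(void*).
    if (posix_memalign(&ptr, std::max(alignment, sizeof(void*)), num_bytes) !=
        0) {
      return nullptr;
    }
    return ptr;
  }
  void Free(void* ptr, size_t num_bytes) override { free(ptr); }
};

}  // namespace tensorflow

Such an object can be handed straight to the BFCAllocator constructor declared earlier, e.g. BFCAllocator(new HostMemSubAllocator, total_bytes, /*allow_growth=*/true, "host_bfc"), which is the same composition the cuda_host_bfc change above uses with CUDAHostAllocator.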

@@ -38,6 +38,26 @@ static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use,
#endif
}
TEST(AllocatorAttributesTest, AllCombos) {
for (bool on_host : {false, true}) {
for (bool nic_compatible : {false, true}) {
for (bool gpu_compatible : {false, true}) {
for (bool track_sizes : {false, true}) {
AllocatorAttributes aa;
aa.set_on_host(on_host);
aa.set_nic_compatible(nic_compatible);
aa.set_gpu_compatible(gpu_compatible);
aa.set_track_sizes(track_sizes);
EXPECT_EQ(on_host, aa.on_host());
EXPECT_EQ(nic_compatible, aa.nic_compatible());
EXPECT_EQ(gpu_compatible, aa.gpu_compatible());
EXPECT_EQ(track_sizes, aa.track_sizes());
}
}
}
}
}
TEST(CPUAllocatorTest, Simple) {
EnableCPUAllocatorStats(true);
Allocator* a = cpu_allocator();

@@ -40,37 +40,30 @@ static const char* const kRetOp = "_Retval";
static const char* const kGradientOp = "SymbolicGradient";
static const char* const kNodeLabel = "Func";
// Represents the index-th output of a node.
struct Endpoint {
Node* node;
int index;
// Returns the string name represents this endpoint.
string name() const {
if (index == 0) {
return node->name();
} else {
return strings::StrCat(node->name(), ":", index);
}
string NodeOut::name() const {
if (index == 0) {
return node->name();
} else {
return strings::StrCat(node->name(), ":", index);
}
}
DataType dtype() const { return node->output_type(index); }
};
DataType NodeOut::dtype() const { return node->output_type(index); }
struct EndpointHash {
uint64 operator()(const Endpoint& x) const {
struct NodeOutHash {
uint64 operator()(const NodeOut& x) const {
return Hash64(reinterpret_cast<const char*>(&x.node), sizeof(Node*),
x.index);
}
};
struct EndpointEq {
bool operator()(const Endpoint& x, const Endpoint& y) const {
struct NodeOutEq {
bool operator()(const NodeOut& x, const NodeOut& y) const {
return (x.node == y.node) && (x.index == y.index);
}
};
static Node* AddZerosLike(Graph* g, Endpoint input) {
static Node* AddZerosLike(Graph* g, NodeOut input) {
DCHECK_LT(0, input.dtype());
DCHECK_LT(input.dtype(), DT_FLOAT_REF);
NodeDef ndef;
@@ -85,7 +78,7 @@ static Node* AddZerosLike(Graph* g, Endpoint input) {
return ret;
}
static Node* AddSymGrad(Graph* g, Node* n, gtl::ArraySlice<Endpoint> grads) {
static Node* AddSymGrad(Graph* g, Node* n, gtl::ArraySlice<NodeOut> grads) {
const int num_x = n->num_inputs();
const int num_y = n->num_outputs();
CHECK_EQ(num_y, grads.size());
@@ -95,19 +88,19 @@ static Node* AddSymGrad(Graph* g, Node* n, gtl::ArraySlice<Endpoint> grads) {
ndef.set_op(kGradientOp);
// The gradient node should have num_x + num_y inputs.
std::vector<Endpoint> n_inputs(num_x);
std::vector<NodeOut> n_inputs(num_x);
for (const Edge* e : n->in_edges()) {
if (e->IsControlEdge()) continue;
n_inputs[e->dst_input()] = {e->src(), e->src_output()};
}
DataTypeVector in_types;
for (const Endpoint& ep : n_inputs) {
ndef.add_input(ep.name());
in_types.push_back(ep.dtype());
for (const NodeOut& nout : n_inputs) {
ndef.add_input(nout.name());
in_types.push_back(nout.dtype());
}
for (const Endpoint& ep : grads) {
ndef.add_input(ep.name());
in_types.push_back(ep.dtype());
for (const NodeOut& nout : grads) {
ndef.add_input(nout.name());
in_types.push_back(nout.dtype());
}
CHECK_EQ(ndef.input_size(), num_x + num_y);
@@ -128,34 +121,34 @@ static Node* AddSymGrad(Graph* g, Node* n, gtl::ArraySlice<Endpoint> grads) {
class SymbolicGradientBuilder {
public:
SymbolicGradientBuilder(gtl::ArraySlice<Node*> y_nodes,
gtl::ArraySlice<Node*> x_nodes,
gtl::ArraySlice<Node*> y_grad_nodes,
std::vector<GradNodeOutput>* x_grad_nodes,
SymbolicGradientBuilder(gtl::ArraySlice<NodeOut> y_node_outputs,
gtl::ArraySlice<NodeOut> x_node_outputs,
gtl::ArraySlice<NodeOut> y_grad_node_outputs,
std::vector<NodeOut>* x_grad_node_outputs,
Graph* graph);
Status Compute();
private:
gtl::ArraySlice<Node*> y_nodes_;
gtl::ArraySlice<Node*> x_nodes_;
gtl::ArraySlice<Node*> y_grad_nodes_;
std::vector<GradNodeOutput>* x_grad_nodes_;
gtl::ArraySlice<NodeOut> y_node_outputs_;
gtl::ArraySlice<NodeOut> x_node_outputs_;
gtl::ArraySlice<NodeOut> y_grad_node_outputs_;
std::vector<NodeOut>* x_grad_node_outputs_;
Graph* graph_; // Not owned.
// A vector of output endpoints which represents backpropagated
// gradients
typedef std::vector<Endpoint> BackpropedGradients;
typedef std::vector<NodeOut> BackpropedGradients;
// backprops_ is a map from an output endpoint to its accumulated
// gradients. When an output endpoint has accumulated all its
// backprops_ is a map from a node output to its accumulated
// gradients. When a node output has accumulated all its
// gradients, we add a node which sums them up.
std::unordered_map<Endpoint, BackpropedGradients, EndpointHash, EndpointEq>
std::unordered_map<NodeOut, BackpropedGradients, NodeOutHash, NodeOutEq>
backprops_;
// pending[i] is count-down counter for i-th node's expected
// backprops. When pending[i] becomes zero, we collected all
// backprop gradients for all output endpoint of the ith-node.
// backprop gradients for all outputs of the ith-node.
std::vector<int> pending_;
// 'ready' keeps track of nodes that have been completely
@@ -163,7 +156,8 @@ class SymbolicGradientBuilder {
// add dy as an input of the gradient function.
std::deque<Node*> ready_;
// The set of nodes at which to stop backprop (and populate 'x_grad_nodes_').
// The set of nodes at which to stop backprop.
// Maps from node.id -> index of 'x_node_outputs_'
std::unordered_map<int, int> stop_nodes_;
// Initialize pending_ and ready_.
@ -173,33 +167,35 @@ class SymbolicGradientBuilder {
// to 'dst', when the backprop algorithm constructs the node
// 'dst_grad' which computes the gradient, we need to propagate it
// to 'src'.
void BackpropAlongEdge(const Endpoint& dst_grad, const Endpoint& src);
void BackpropZerosAlongEdge(const Endpoint& src);
void BackpropAlongEdge(const NodeOut& dst_grad, const NodeOut& src);
void BackpropZerosAlongEdge(const NodeOut& src);
Endpoint SumGradients(const Endpoint& src);
NodeOut SumGradients(const NodeOut& src);
TF_DISALLOW_COPY_AND_ASSIGN(SymbolicGradientBuilder);
};
SymbolicGradientBuilder::SymbolicGradientBuilder(
gtl::ArraySlice<Node*> y_nodes,
gtl::ArraySlice<Node*> x_nodes,
gtl::ArraySlice<Node*> y_grad_nodes,
std::vector<GradNodeOutput>* x_grad_nodes,
Graph* graph) : y_nodes_(y_nodes), x_nodes_(x_nodes),
y_grad_nodes_(y_grad_nodes), x_grad_nodes_(x_grad_nodes),
graph_(graph) {
CHECK_EQ(y_nodes_.size(), y_grad_nodes.size());
x_grad_nodes_->clear();
x_grad_nodes_->resize(x_nodes_.size());
stop_nodes_.reserve(x_nodes_.size());
for (int i = 0; i < x_nodes_.size(); ++i) {
stop_nodes_.insert(std::make_pair(x_nodes_[i]->id(), i));
gtl::ArraySlice<NodeOut> y_node_outputs,
gtl::ArraySlice<NodeOut> x_node_outputs,
gtl::ArraySlice<NodeOut> y_grad_node_outputs,
std::vector<NodeOut>* x_grad_node_outputs, Graph* graph)
: y_node_outputs_(y_node_outputs),
x_node_outputs_(x_node_outputs),
y_grad_node_outputs_(y_grad_node_outputs),
x_grad_node_outputs_(x_grad_node_outputs),
graph_(graph) {
CHECK_EQ(y_node_outputs_.size(), y_grad_node_outputs.size());
x_grad_node_outputs_->clear();
x_grad_node_outputs_->resize(x_node_outputs_.size());
stop_nodes_.reserve(x_node_outputs_.size());
for (int i = 0; i < x_node_outputs_.size(); ++i) {
stop_nodes_.insert(std::make_pair(x_node_outputs_[i].node->id(), i));
}
}
void SymbolicGradientBuilder::BackpropAlongEdge(const Endpoint& dst_grad,
const Endpoint& src) {
void SymbolicGradientBuilder::BackpropAlongEdge(const NodeOut& dst_grad,
const NodeOut& src) {
CHECK_NOTNULL(src.node);
auto iter = backprops_.find(src);
if (iter != backprops_.end()) {
@ -211,7 +207,7 @@ void SymbolicGradientBuilder::BackpropAlongEdge(const Endpoint& dst_grad,
}
}
void SymbolicGradientBuilder::BackpropZerosAlongEdge(const Endpoint& src) {
void SymbolicGradientBuilder::BackpropZerosAlongEdge(const NodeOut& src) {
CHECK_NOTNULL(src.node);
auto iter = backprops_.find(src);
if (iter != backprops_.end()) {
@ -227,9 +223,9 @@ void SymbolicGradientBuilder::InitBackprop() {
backprops_.clear();
std::unordered_set<Node*> visited;
std::deque<Node*> queue;
for (Node* n : x_nodes_) {
queue.push_back(n);
visited.insert(n);
for (const NodeOut& nout : x_node_outputs_) {
queue.push_back(nout.node);
visited.insert(nout.node);
}
// Going forward to figure out which endpoints need to be backpropagated.
@ -255,20 +251,19 @@ void SymbolicGradientBuilder::InitBackprop() {
}
{
const int num_y = y_grad_nodes_.size();
const int num_y = y_grad_node_outputs_.size();
for (int i = 0; i < num_y; ++i) {
Node* y = y_nodes_[i];
Node* dy = y_grad_nodes_[i];
Node* y = y_node_outputs_[i].node;
for (const Edge* e : y->in_edges()) {
if (e->IsControlEdge()) continue;
BackpropAlongEdge({dy, e->dst_input()}, {e->src(), e->src_output()});
BackpropAlongEdge(y_grad_node_outputs_[i], {e->src(), e->src_output()});
}
}
}
CHECK(!ready_.empty());
}
Endpoint SymbolicGradientBuilder::SumGradients(const Endpoint& src) {
NodeOut SymbolicGradientBuilder::SumGradients(const NodeOut& src) {
const DataType dtype = src.dtype();
auto iter = backprops_.find(src);
CHECK(iter != backprops_.end());
@ -286,8 +281,8 @@ Endpoint SymbolicGradientBuilder::SumGradients(const Endpoint& src) {
NodeDef ndef;
ndef.set_name(graph_->NewName(kNodeLabel));
ndef.set_op("AddN"); // N-way Add
for (const Endpoint& ep : grads) {
ndef.add_input(ep.name());
for (const NodeOut& nout : grads) {
ndef.add_input(nout.name());
}
AddNodeAttr("N", static_cast<int64>(grads.size()), &ndef);
AddNodeAttr("T", dtype, &ndef);
@ -295,8 +290,8 @@ Endpoint SymbolicGradientBuilder::SumGradients(const Endpoint& src) {
Node* add = graph_->AddNode(ndef, &s);
TF_CHECK_OK(s);
for (size_t i = 0; i < grads.size(); ++i) {
const Endpoint& ep = grads[i];
graph_->AddEdge(ep.node, ep.index, add, i);
const NodeOut& nout = grads[i];
graph_->AddEdge(nout.node, nout.index, add, i);
}
return {add, 0};
}
@ -312,7 +307,7 @@ Status SymbolicGradientBuilder::Compute() {
InitBackprop();
// Backward propagation.
gtl::InlinedVector<Endpoint, 8> dy;
gtl::InlinedVector<NodeOut, 8> dy;
while (!ready_.empty()) {
// n has collected all gradients.
Node* n = ready_.front();
@ -324,11 +319,11 @@ Status SymbolicGradientBuilder::Compute() {
auto iter = stop_nodes_.find(n->id());
if (iter != stop_nodes_.end()) {
// Stop backprop and add gradient sum to 'x_grad_nodes'.
// Stop backprop and add gradient sum to 'x_grad_node_outputs_'.
// TODO(andydavis) Support stop nodes with more than one output.
CHECK_EQ(1, num_y);
Endpoint grad = SumGradients({n, 0});
(*x_grad_nodes_)[iter->second] = {grad.node, grad.index};
const int index = iter->second;
(*x_grad_node_outputs_)[index] = SumGradients(x_node_outputs_[index]);
continue;
}
@ -350,6 +345,7 @@ Status SymbolicGradientBuilder::Compute() {
// Adds a gradient node with num_x + num_y inputs and num_x
// outputs.
// TODO(andydavis) Support primitive gradient ops.
Node* grad = AddSymGrad(graph_, n, dy);
for (const Edge* e : n->in_edges()) {
if (e->IsControlEdge()) continue;
@ -369,12 +365,13 @@ Status SymbolicGradientBuilder::Compute() {
return Status::OK();
}
Status AddSymbolicGradients(gtl::ArraySlice<Node*> y_nodes,
gtl::ArraySlice<Node*> x_nodes,
gtl::ArraySlice<Node*> y_grad_nodes,
std::vector<GradNodeOutput>* x_grad_nodes,
Status AddSymbolicGradients(gtl::ArraySlice<NodeOut> y_node_outputs,
gtl::ArraySlice<NodeOut> x_node_outputs,
gtl::ArraySlice<NodeOut> y_grad_node_outputs,
std::vector<NodeOut>* x_grad_node_outputs,
Graph* graph) {
SymbolicGradientBuilder builder(y_nodes, x_nodes, y_grad_nodes, x_grad_nodes,
SymbolicGradientBuilder builder(y_node_outputs, x_node_outputs,
y_grad_node_outputs, x_grad_node_outputs,
graph);
return builder.Compute();
}

View File

@ -16,40 +16,41 @@ limitations under the License.
#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPH_GRADIENTS_H_
#define THIRD_PARTY_TENSORFLOW_CORE_GRAPH_GRADIENTS_H_
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
namespace tensorflow {
// GradNodeOutput represents a single gradient node output.
struct GradNodeOutput {
// Represents the output of 'node' at 'index'.
struct NodeOut {
Node* node;
int index;
// Returns the string name that represents the output of this node.
string name() const;
// Returns the data type of the output of this node.
DataType dtype() const;
};
// NOTE: This API is a work in progress and will likely be changing frequently.
//
// Given initial gradient nodes 'y_grad_nodes' (which compute the symbolic
// partial derivatives of some loss function 'L' w.r.t the inputs of each
// node in 'y_nodes'), adds gradient nodes to 'graph' that compute the sum
// of all gradients flowing into the single output of each node in 'x_nodes'.
// Note that gradient nodes will not be added to 'graph' which compute
// the symbolic partial derivative of 'L' w.r.t. each node in 'x_nodes' (i.e.
// backprop will stop at these nodes). This restriction will be lifted in
// a subsequent CL.
// Given initial gradient-node outputs 'y_grad_node_outputs' (which compute the
// symbolic partial derivatives of some loss function 'L' w.r.t. the node outputs
// 'y_node_outputs'), adds gradient nodes to 'graph' that compute the symbolic
// partial derivatives of 'L' w.r.t. the node outputs 'x_node_outputs'.
//
// REQUIRES: Each node in 'x_nodes' must have a single output (this
// restriction will be removed in a subsequent change).
// REQUIRES: Each node in 'x_node_outputs' must be unique, and therefore have a
// single output (this restriction will be removed in a subsequent change).
// TODO(andydavis) Add support for returning 'x_node' gradients by endpoint
// (i.e. {node, index}).
// TODO(andydavis) Add symbolic gradient support for general graphs (the current
// implementation only supports gradients for functions). In particular,
// the nodes in 'x_nodes' are currently restricted to have one output.
Status AddSymbolicGradients(gtl::ArraySlice<Node*> y_nodes,
gtl::ArraySlice<Node*> x_nodes,
gtl::ArraySlice<Node*> y_grad_nodes,
std::vector<GradNodeOutput>* x_grad_nodes,
Status AddSymbolicGradients(gtl::ArraySlice<NodeOut> y_node_outputs,
gtl::ArraySlice<NodeOut> x_node_outputs,
gtl::ArraySlice<NodeOut> y_grad_node_outputs,
std::vector<NodeOut>* x_grad_node_outputs,
Graph* graph);
} // namespace tensorflow
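As a quick illustration of the endpoint-based signature above, a minimal call-site sketch (the Node* values y, x, dy and the surrounding graph are hypothetical; each contributes its 0-th output):
std::vector<NodeOut> y_outputs = {{y, 0}};
std::vector<NodeOut> x_outputs = {{x, 0}};
std::vector<NodeOut> dy_outputs = {{dy, 0}};  // dL/dy fed into the backprop
std::vector<NodeOut> dx_outputs;              // filled with dL/dx on success
Status s = AddSymbolicGradients(y_outputs, x_outputs, dy_outputs,
                                &dx_outputs, graph);
if (!s.ok()) { /* handle error */ }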

View File

@ -214,6 +214,21 @@ cc_header_only_library(
deps = [":bounds_check"],
)
cc_library(
name = "image_resizer_state",
hdrs = ["image_resizer_state.h"],
visibility = ["//visibility:private"],
deps = [
"//tensorflow/core:lib",
"//third_party/eigen3",
],
)
cc_header_only_library(
name = "image_resizer_state_lib",
deps = [":image_resizer_state"],
)
# OpKernel libraries ----------------------------------------------------------
tf_kernel_libraries(
@ -221,7 +236,6 @@ tf_kernel_libraries(
prefixes = [
"bcast_ops",
"bitcast_op",
"depthtospace_op",
"concat_op",
"constant_op",
"diag_op",
@ -239,7 +253,6 @@ tf_kernel_libraries(
"reverse_sequence_op",
"shape_ops",
"slice_op",
"spacetodepth_op",
"split_op",
"tile_ops",
"transpose_op",
@ -250,6 +263,7 @@ tf_kernel_libraries(
deps = [
":bounds_check",
":concat_lib",
":depth_space_ops",
":fill_functor",
":ops_util",
":split_lib",
@ -545,6 +559,7 @@ tf_kernel_libraries(
"sample_distorted_bounding_box_op",
],
deps = [
":image_resizer_state",
"//tensorflow/core:framework",
"//tensorflow/core:image_ops_op_lib",
"//tensorflow/core:lib",
@ -830,6 +845,31 @@ tf_kernel_library(
],
)
tf_kernel_library(
name = "depth_space_ops",
srcs = [
"depthtospace_op.cc",
"spacetodepth_op.cc",
],
hdrs = [
"depthtospace_op.h",
"spacetodepth_op.h",
],
gpu_srcs = [
"depthtospace_op.h",
"depthtospace_op_gpu.cu.cc",
"spacetodepth_op.h",
"spacetodepth_op_gpu.cu.cc",
],
visibility = ["//visibility:private"],
deps = [
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//third_party/eigen3",
],
alwayslink = 0,
)
tf_kernel_libraries(
name = "parsing",
prefixes = [
@ -1062,6 +1102,7 @@ filegroup(
"slice_op.h",
"softmax_op.cc",
"softmax_op.h",
"softmax_op_functor.h",
"split_lib.h",
"split_lib_cpu.cc",
"split_op.cc",
@ -1095,10 +1136,12 @@ filegroup(
"batch_norm_op.h",
"control_flow_ops.h",
"conv_2d.h",
"image_resizer_state.h",
"maxpooling_op.h",
"reduction_ops.h",
"reduction_ops_common.h",
"relu_op.h",
"relu_op_functor.h",
"save_restore_tensor.h",
"softplus_op.h",
"softsign_op.h",

View File

@ -113,6 +113,39 @@ perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
perftools::gputools::DeviceMemory<T> typed(wrapped);
return typed;
}
class CublasScratchAllocator : public perftools::gputools::ScratchAllocator {
public:
using Stream = ::perftools::gputools::Stream;
using DeviceMemoryBytes = ::perftools::gputools::DeviceMemory<uint8>;
CublasScratchAllocator(OpKernelContext* context) : context_(context) {}
int64 GetMemoryLimitInBytes(Stream* stream) override { return -1; }
perftools::gputools::port::StatusOr<DeviceMemoryBytes> AllocateBytes(
Stream* stream, int64 byte_size) override {
Tensor temporary_memory;
Status allocation_status(context_->allocate_temp(
DT_UINT8, TensorShape({byte_size}), &temporary_memory));
if (!allocation_status.ok()) {
return perftools::gputools::port::StatusOr<DeviceMemoryBytes>(
DeviceMemoryBytes::MakeFromByteSize(nullptr, 0));
}
// Hold references to the allocated tensors until the allocator is destroyed,
// so the scratch memory is not freed while it may still be in use.
allocated_tensors_.push_back(temporary_memory);
return perftools::gputools::port::StatusOr<DeviceMemoryBytes>(
DeviceMemoryBytes::MakeFromByteSize(
temporary_memory.flat<uint8>().data(),
temporary_memory.flat<uint8>().size()));
}
private:
OpKernelContext* context_;
std::vector<Tensor> allocated_tensors_;
};
} // namespace
template <typename Scalar>
@ -162,12 +195,14 @@ struct LaunchBatchMatMul<GPUDevice, Scalar> {
// where A, B and C are assumed to be in column major.
// We want the output to be in row-major, so we can compute
// C' = B' x A' (' stands for transpose)
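// (A row-major matrix handed to column-major cuBLAS is read as its transpose,
// so computing B' x A' leaves C = A x B laid out in row-major order in memory.)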
CublasScratchAllocator scratch_allocator(context);
bool blas_launch_status =
stream->ThenBlasGemmBatched(blas_transpose_b, blas_transpose_a, n, m, k,
static_cast<Scalar>(1.0), b_ptrs,
adj_y ? k : n, a_ptrs, adj_x ? m : k,
static_cast<Scalar>(0.0), c_ptrs, n,
batch_size)
stream
->ThenBlasGemmBatchedWithScratch(
blas_transpose_b, blas_transpose_a, n, m, k,
static_cast<Scalar>(1.0), b_ptrs, adj_y ? k : n, a_ptrs,
adj_x ? m : k, static_cast<Scalar>(0.0), c_ptrs, n, batch_size,
&scratch_allocator)
.ok();
if (!blas_launch_status) {
context->SetStatus(errors::Internal(
@ -265,9 +300,7 @@ REGISTER_CPU(int32);
REGISTER_CPU(complex64);
#ifdef GOOGLE_CUDA
// TODO(kalakris): The GPU implementation is currently disabled due to issues
// encountered in practice. See b/24534272.
// REGISTER_GPU(float);
REGISTER_GPU(float);
#endif // GOOGLE_CUDA
#undef REGISTER_CPU

View File

@ -45,7 +45,7 @@ class DecodeCSVOp : public OpKernel {
OP_REQUIRES_OK(ctx, ctx->input("records", &records));
OP_REQUIRES_OK(ctx, ctx->input_list("record_defaults", &record_defaults));
for (int i = 0; i < record_defaults.size(); ++i) {
for (int64 i = 0; i < record_defaults.size(); ++i) {
OP_REQUIRES(ctx, record_defaults[i].NumElements() < 2,
errors::InvalidArgument(
"There should only be 1 default per field but field ", i,
@ -53,7 +53,7 @@ class DecodeCSVOp : public OpKernel {
}
auto records_t = records->flat<string>();
int records_size = records_t.size();
int64 records_size = records_t.size();
OpOutputList output;
OP_REQUIRES_OK(ctx, ctx->output_list("output", &output));
@ -63,7 +63,7 @@ class DecodeCSVOp : public OpKernel {
output.allocate(i, records->shape(), &out);
}
for (int i = 0; i < records_size; ++i) {
for (int64 i = 0; i < records_size; ++i) {
const StringPiece record(records_t(i));
std::vector<string> fields;
ExtractFields(ctx, record, &fields);
@ -165,7 +165,7 @@ class DecodeCSVOp : public OpKernel {
void ExtractFields(OpKernelContext* ctx, StringPiece input,
std::vector<string>* result) {
int current_idx = 0;
int64 current_idx = 0;
if (!input.empty()) {
while (static_cast<size_t>(current_idx) < input.size()) {
if (input[current_idx] == '\n' || input[current_idx] == '\r') {

View File

@ -21,6 +21,8 @@ limitations under the License.
#include <string>
#include <utility>
#include "tensorflow/core/kernels/depthtospace_op.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
@ -60,8 +62,8 @@ class DepthToSpaceOp : public OpKernel {
"instead of: ", dims));
const int batch_size = input.dim_size(0);
const int height = input.dim_size(1);
const int width = input.dim_size(2);
const int input_height = input.dim_size(1);
const int input_width = input.dim_size(2);
const int input_depth = input.dim_size(3);
const int block_size_sq = block_size_ * block_size_;
@ -73,41 +75,58 @@ class DepthToSpaceOp : public OpKernel {
"should be divisible by: ", block_size_sq));
const int output_depth = input_depth / block_size_sq;
const int output_width = width * block_size_;
const int output_height = height * block_size_;
const int output_width = input_width * block_size_;
const int output_height = input_height * block_size_;
// Allocate output tensor.
Tensor* outputs_tensor = nullptr;
Tensor* output = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(
0, TensorShape({batch_size, output_height,
output_width, output_depth}),
&outputs_tensor));
&output));
auto Toutput = outputs_tensor->tensor<T, 4>();
auto Tinput = input.tensor<T, 4>();
typename TTypes<T, 4>::ConstTensor Tinput = input.tensor<T, 4>();
typename TTypes<T, 4>::Tensor Toutput = output->tensor<T, 4>();
for (int b = 0; b < batch_size; ++b) {
for (int h = 0; h < output_height; ++h) {
const int in_h = h / block_size_;
const int offset_h = (h % block_size_);
for (int w = 0; w < output_width; ++w) {
const int in_w = w / block_size_;
const int offset_w = (w % block_size_);
const int offset_d =
(offset_h * block_size_ + offset_w) * output_depth;
for (int d = 0; d < output_depth; ++d) {
const int in_d = d + offset_d;
Toutput(b, h, w, d) = Tinput(b, in_h, in_w, in_d);
}
}
}
}
functor::DepthToSpaceOpFunctor<Device, T> functor;
functor(context->eigen_device<Device>(), Tinput, block_size_, Toutput);
};
private:
int block_size_;
};
// Partial specialization of DepthToSpaceOpFunctor for a CPUDevice.
namespace functor {
template <typename T>
struct DepthToSpaceOpFunctor<CPUDevice, T> {
void operator()(const CPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
int block_size, typename TTypes<T, 4>::Tensor output) {
const int batch_size = output.dimension(0);
const int output_height = output.dimension(1);
const int output_width = output.dimension(2);
const int output_depth = output.dimension(3);
for (int b = 0; b < batch_size; ++b) {
for (int h = 0; h < output_height; ++h) {
const int in_h = h / block_size;
const int offset_h = (h % block_size);
for (int w = 0; w < output_width; ++w) {
const int in_w = w / block_size;
const int offset_w = (w % block_size);
const int offset_d =
(offset_h * block_size + offset_w) * output_depth;
for (int d = 0; d < output_depth; ++d) {
const int in_d = d + offset_d;
output(b, h, w, d) = input(b, in_h, in_w, in_d);
}
}
}
}
}
};
} // namespace functor
#define REGISTER(type) \
REGISTER_KERNEL_BUILDER( \
Name("DepthToSpace").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
@ -116,4 +135,10 @@ class DepthToSpaceOp : public OpKernel {
TF_CALL_ALL_TYPES(REGISTER);
#undef REGISTER
#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(
Name("DepthToSpace").Device(DEVICE_GPU).TypeConstraint<float>("T"),
DepthToSpaceOp<GPUDevice, float>);
#endif // GOOGLE_CUDA
} // end namespace tensorflow

View File

@ -0,0 +1,44 @@
/* Copyright 2015 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CORE_KERNELS_DEPTHTOSPACE_OP_H_
#define TENSORFLOW_CORE_KERNELS_DEPTHTOSPACE_OP_H_
// Functor definition for DepthToSpaceOp, must be compilable by nvcc.
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor_types.h"
namespace tensorflow {
namespace functor {
// Functor used by DepthToSpaceOp to do the computations.
template <typename Device, typename T>
struct DepthToSpaceOpFunctor {
// Implements the depth to space conversion.
//
// input: 4-D input tensor.
// block_size: block size for the conversion.
// output: 4-D output tensor.
//
// The dimensions of the tensors are guaranteed to be correct when the
// functor is called.
void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor input,
int block_size, typename TTypes<T, 4>::Tensor output);
};
} // namespace functor
} // namespace tensorflow
#endif // TENSORFLOW_CORE_KERNELS_DEPTHTOSPACE_OP_H_
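As a sanity check of the index arithmetic used by the functors, here is a minimal standalone sketch (plain C++, not TensorFlow code) of the NHWC depth-to-space mapping for block_size = 2: an input of shape [1, 1, 1, 4] becomes an output of shape [1, 2, 2, 1], with the four depth values laid out as a 2x2 spatial block.
#include <cstdio>
int main() {
  const int block_size = 2;
  const int output_depth = 1;                   // input_depth / block_size^2
  const float input[4] = {1.f, 2.f, 3.f, 4.f};  // NHWC shape [1, 1, 1, 4]
  float output[2][2];                           // NHWC shape [1, 2, 2, 1]
  for (int h = 0; h < 2; ++h) {
    for (int w = 0; w < 2; ++w) {
      // Same arithmetic as the CPU/GPU functors; in_h and in_w are both 0 here.
      const int offset_d =
          ((h % block_size) * block_size + (w % block_size)) * output_depth;
      output[h][w] = input[offset_d];  // in_d = d + offset_d with d == 0
    }
  }
  std::printf("%g %g\n%g %g\n", output[0][0], output[0][1],
              output[1][0], output[1][1]);  // prints: 1 2 / 3 4
  return 0;
}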

View File

@ -0,0 +1,88 @@
/* Copyright 2015 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#include "tensorflow/core/kernels/depthtospace_op.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"
namespace tensorflow {
typedef Eigen::GpuDevice GPUDevice;
template <typename dtype>
__global__ void D2S(const int32 nthreads, const dtype* input_ptr,
const int block_size, const int batch_size,
const int input_height, const int input_width,
const int input_depth, const int output_height,
const int output_width, const int output_depth,
dtype* output_ptr) {
CUDA_1D_KERNEL_LOOP(out_idx, nthreads) {
// out_idx = d + output_depth * (w + output_width * (h + output_height * b))
const int d = out_idx % output_depth;
const int out_idx2 = out_idx / output_depth;
const int w = out_idx2 % output_width;
const int out_idx3 = out_idx2 / output_width;
const int h = out_idx3 % output_height;
const int b = out_idx3 / output_height;
const int in_h = h / block_size;
const int offset_h = h % block_size;
const int in_w = w / block_size;
const int offset_w = w % block_size;
const int offset_d = (offset_h * block_size + offset_w) * output_depth;
const int in_d = d + offset_d;
const int inp_idx =
in_d + input_depth * (in_w + input_width * (in_h + input_height * b));
*(output_ptr + out_idx) = ldg(input_ptr + inp_idx);
}
}
// Specialization of DepthToSpaceOpFunctor for a GPUDevice.
namespace functor {
template <typename T>
struct DepthToSpaceOpFunctor<GPUDevice, T> {
void operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
int block_size, typename TTypes<T, 4>::Tensor output) {
const int batch_size = output.dimension(0);
const int input_height = input.dimension(1);
const int input_width = input.dimension(2);
const int input_depth = input.dimension(3);
const int output_height = output.dimension(1);
const int output_width = output.dimension(2);
const int output_depth = output.dimension(3);
const int total_count =
batch_size * output_height * output_width * output_depth;
CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
D2S<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
config.virtual_thread_count, input.data(), block_size, batch_size,
input_height, input_width, input_depth, output_height, output_width,
output_depth, output.data());
}
};
} // end namespace functor
// Instantiate the GPU implementation for float.
template struct functor::DepthToSpaceOpFunctor<GPUDevice, float>;
} // end namespace tensorflow
#endif // GOOGLE_CUDA

View File

@ -0,0 +1,111 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// This is a helper struct to package up the input and output
// parameters of an image resizer (the height, widths, etc.). To
// reduce code duplication and ensure consistency across the different
// resizers, it performs the input validation.
#ifndef TENSORFLOW_KERNELS_IMAGE_RESIZER_STATE_H_
#define TENSORFLOW_KERNELS_IMAGE_RESIZER_STATE_H_
#define EIGEN_USE_THREADS
#include <math.h>
#include <algorithm>
#include <array>
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/bounds_check.h"
namespace tensorflow {
struct ImageResizerState {
explicit ImageResizerState(bool align_corners)
: align_corners_(align_corners) {}
// ValidateAndCreateOutput checks the bounds on the input tensors
// and requested size, sets up some of the resizing state such as the
// height_scale and width_scale, and allocates the output.
// If any of these operations fails, it sets an error status in
// the context, which the caller must check.
void ValidateAndCreateOutput(OpKernelContext* context, const Tensor& input) {
OP_REQUIRES(context, input.dims() == 4,
errors::InvalidArgument("input must be 4-dimensional",
input.shape().DebugString()));
const Tensor& shape_t = context->input(1);
OP_REQUIRES(context, shape_t.dims() == 1,
errors::InvalidArgument("shape_t must be 1-dimensional",
shape_t.shape().DebugString()));
OP_REQUIRES(context, shape_t.NumElements() == 2,
errors::InvalidArgument("shape_t must have two elements",
shape_t.shape().DebugString()));
auto Svec = shape_t.vec<int32>();
batch_size = input.dim_size(0);
out_height = internal::SubtleMustCopy(Svec(0));
out_width = internal::SubtleMustCopy(Svec(1));
OP_REQUIRES(
context,
FastBoundsCheck(input.dim_size(1), std::numeric_limits<int32>::max()) &&
FastBoundsCheck(input.dim_size(2),
std::numeric_limits<int32>::max()),
errors::InvalidArgument("input sizes must be between 0 and max int32"));
in_height = static_cast<int32>(input.dim_size(1));
in_width = static_cast<int32>(input.dim_size(2));
channels = input.dim_size(3);
OP_REQUIRES(context, out_height > 0 && out_width > 0,
errors::InvalidArgument("output dimensions must be positive"));
OP_REQUIRES(
context, channels > 0,
errors::InvalidArgument("image must have at least one channel"));
OP_REQUIRES(
context, input.dim_size(1) > 0 && input.dim_size(2) > 0,
errors::InvalidArgument("input image must be of non-zero size"));
OP_REQUIRES_OK(context, context->allocate_output(
0, TensorShape({input.dim_size(0), out_height,
out_width, input.dim_size(3)}),
&output));
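// When align_corners is true, the centers of the corner pixels of the input
// and output are aligned, so (out - 1) output steps span (in - 1) input steps;
// otherwise the scale is the plain size ratio.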
height_scale = (align_corners_ && out_height > 1)
? (in_height - 1) / static_cast<float>(out_height - 1)
: in_height / static_cast<float>(out_height);
width_scale = (align_corners_ && out_width > 1)
? (in_width - 1) / static_cast<float>(out_width - 1)
: in_width / static_cast<float>(out_width);
}
int64 batch_size;
int64 out_height;
int64 out_width;
int64 in_height;
int64 in_width;
int64 channels;
float height_scale;
float width_scale;
Tensor* output;
private:
bool align_corners_;
};
} // namespace tensorflow
#endif // TENSORFLOW_KERNELS_IMAGE_RESIZER_STATE_H_
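For reference, the resize kernels updated below all follow the same call pattern; a minimal sketch of it (the surrounding kernel body is elided):
ImageResizerState st(align_corners_);
st.ValidateAndCreateOutput(context, input);
if (!context->status().ok()) return;  // validation or allocation failed
// From here on st.batch_size, st.in_height, st.in_width, st.out_height,
// st.out_width, st.channels, st.height_scale, st.width_scale and st.output
// are all populated and ready to use.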

View File

@ -492,6 +492,8 @@ static void BM_ConvFloatDepthwise(int iters, int batch, int rows, int cols,
// OD: output_depth
// KR: kernel_rows
// KC: kernel_cols
// STR: stride
// PAD: padding
#define BM_ConvFloatDepthwiseFwd(BS, R, C, ID, DM, OD, KR, KC, STR, PAD, \
LABEL) \
@ -509,12 +511,25 @@ static void BM_ConvFloatDepthwise(int iters, int batch, int rows, int cols,
strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \
KR, "_", KC, "_", STR, "_", PAD, "_cpu4")); \
} \
static void BM_ConvFloatDepthwiseFwdGPU_##LABEL(int iters) { \
BM_ConvFloatDepthwise( \
iters, BS, R, C, ID, DM, OD, KR, KC, DEPTHWISE_CONV_OP_FWD, 1, STR, \
PAD, true, \
strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \
KR, "_", KC, "_", STR, "_", PAD, "_gpu")); \
} \
BENCHMARK(BM_ConvFloatDepthwiseFwdCPU1_##LABEL); \
BENCHMARK(BM_ConvFloatDepthwiseFwdCPU4_##LABEL)
BENCHMARK(BM_ConvFloatDepthwiseFwdCPU4_##LABEL); \
BENCHMARK(BM_ConvFloatDepthwiseFwdGPU_##LABEL);
// TODO(andydavis,jmchen) Add more benchmarks.
// The configurations below are mostly from mobilenet models.
BM_ConvFloatDepthwiseFwd(32, 112, 112, 3, 8, 24, 3, 3, 1, SAME, conv0);
BM_ConvFloatDepthwiseFwd(32, 112, 112, 64, 1, 64, 3, 3, 1, SAME, conv1);
BM_ConvFloatDepthwiseFwd(32, 56, 56, 128, 1, 128, 3, 3, 1, SAME, conv2);
BM_ConvFloatDepthwiseFwd(32, 56, 56, 128, 1, 128, 3, 3, 2, SAME, conv3);
BM_ConvFloatDepthwiseFwd(32, 28, 28, 128, 1, 128, 3, 3, 1, SAME, conv4);
BM_ConvFloatDepthwiseFwd(32, 14, 14, 512, 1, 512, 3, 3, 1, SAME, conv5);
BM_ConvFloatDepthwiseFwd(32, 7, 7, 1024, 1, 1024, 3, 3, 1, SAME, conv6);
static void BM_LRNFloat(int iters, int depth, int cols, int rows,
int batch_size, int range, int num_threads,

View File

@ -30,147 +30,6 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
template <typename Device, typename T>
class ReluOp : public UnaryElementWiseOp<T, ReluOp<Device, T>> {
public:
using UnaryElementWiseOp<T, ReluOp<Device, T>>::UnaryElementWiseOp;
void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
functor::Relu<Device, T> functor;
functor(context->eigen_device<Device>(), input.flat<T>(),
output->flat<T>());
}
};
// Out of line check to save code space (we have this code once, rather
// than once for every NDIMS * NumTypes * Num_different_relu_variants
// functions).
static void ValidateSameSizeHelper(OpKernelContext* context, const Tensor& g,
const Tensor& a) {
OP_REQUIRES(context, a.IsSameSize(g),
errors::InvalidArgument("g and a must be the same size"));
}
static bool ValidateSameSize(OpKernelContext* context, const Tensor& g,
const Tensor& a) {
ValidateSameSizeHelper(context, g, a);
return context->status().ok();
}
template <typename Device, typename T>
class ReluGradOp : public BinaryElementWiseOp<T, ReluGradOp<Device, T>> {
public:
using BinaryElementWiseOp<T, ReluGradOp<Device, T>>::BinaryElementWiseOp;
void OperateNoTemplate(OpKernelContext* context, const Tensor& g,
const Tensor& a, Tensor* output);
// INPUTS:
// g (gradients): backpropagated gradients
// a (inputs): either the inputs that were passed to ReluOp(), or its
// outputs (using either one yields the same result here).
// OUTPUT:
// gradients to backprop
template <int NDIMS>
void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a,
Tensor* output) {
OperateNoTemplate(context, g, a, output);
}
};
template <typename Device, typename T>
void ReluGradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
const Tensor& g, const Tensor& a,
Tensor* output) {
if (!ValidateSameSize(context, g, a)) return;
functor::ReluGrad<Device, T> functor;
functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(),
output->flat<T>());
}
template <typename Device, typename T>
class Relu6Op : public UnaryElementWiseOp<T, Relu6Op<Device, T>> {
public:
using UnaryElementWiseOp<T, Relu6Op<Device, T>>::UnaryElementWiseOp;
void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
functor::Relu6<Device, T> functor;
functor(context->eigen_device<Device>(), input.flat<T>(),
output->flat<T>());
}
};
template <typename Device, typename T>
class Relu6GradOp : public BinaryElementWiseOp<T, Relu6GradOp<Device, T>> {
public:
using BinaryElementWiseOp<T, Relu6GradOp<Device, T>>::BinaryElementWiseOp;
void OperateNoTemplate(OpKernelContext* context, const Tensor& g,
const Tensor& a, Tensor* output);
// INPUTS:
// g (gradients): backpropagated gradients
// a (inputs): inputs that were passed to Relu6Op()
// OUTPUT:
// gradients to backprop
template <int NDIMS>
void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a,
Tensor* output) {
OperateNoTemplate(context, g, a, output);
}
};
template <typename Device, typename T>
void Relu6GradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
const Tensor& g, const Tensor& a,
Tensor* output) {
if (!ValidateSameSize(context, g, a)) return;
functor::Relu6Grad<Device, T> functor;
functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(),
output->flat<T>());
}
template <typename Device, typename T>
class EluOp : public UnaryElementWiseOp<T, EluOp<Device, T>> {
public:
using UnaryElementWiseOp<T, EluOp<Device, T>>::UnaryElementWiseOp;
void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
functor::Elu<Device, T> functor;
functor(context->eigen_device<Device>(), input.flat<T>(),
output->flat<T>());
}
};
template <typename Device, typename T>
class EluGradOp : public BinaryElementWiseOp<T, EluGradOp<Device, T>> {
public:
using BinaryElementWiseOp<T, EluGradOp<Device, T>>::BinaryElementWiseOp;
void OperateNoTemplate(OpKernelContext* context, const Tensor& g,
const Tensor& a, Tensor* output);
// INPUTS:
// g (gradients): backpropagated gradients
// a (outputs): outputs of the EluOp()
// OUTPUT:
// gradients to backprop
template <int NDIMS>
void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a,
Tensor* output) {
OperateNoTemplate(context, g, a, output);
}
};
template <typename Device, typename T>
void EluGradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
const Tensor& g, const Tensor& a,
Tensor* output) {
if (!ValidateSameSize(context, g, a)) return;
functor::EluGrad<Device, T> functor;
functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(),
output->flat<T>());
}
#define REGISTER_RELU_KERNELS(type) \
REGISTER_KERNEL_BUILDER( \
Name("Relu").Device(DEVICE_CPU).TypeConstraint<type>("T"), \

View File

@ -13,118 +13,168 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// See docs in ../ops/nn_ops.cc.
#ifndef TENSORFLOW_KERNELS_RELU_OP_H_
#define TENSORFLOW_KERNELS_RELU_OP_H_
// Functor definition for ReluOp and ReluGradOp, must be compilable by nvcc.
#define EIGEN_USE_THREADS
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/kernels/relu_op_functor.h"
#include "tensorflow/core/lib/core/errors.h"
namespace tensorflow {
namespace functor {
// Functor used by ReluOp to do the computations.
template <typename Device, typename T>
struct Relu {
// Computes Relu activation.
//
// features: any shape.
// activations: same shape as "features".
void operator()(const Device& d, typename TTypes<T>::ConstTensor features,
typename TTypes<T>::Tensor activations) {
activations.device(d) = features.cwiseMax(static_cast<T>(0));
class ReluOp : public UnaryElementWiseOp<T, ReluOp<Device, T>> {
public:
using UnaryElementWiseOp<T, ReluOp<Device, T>>::UnaryElementWiseOp;
void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
functor::Relu<Device, T> functor;
functor(context->eigen_device<Device>(), input.flat<T>(),
output->flat<T>());
}
};
// Functor used by ReluGradOp to do the computations.
template <typename Device, typename T>
struct ReluGrad {
// Computes ReluGrad backprops.
//
// gradients: gradients backpropagated to the Relu op.
// features: either the inputs that were passed to the Relu op, or its
// outputs (using either one yields the same result here).
// backprops: gradients to backpropagate to the Relu inputs.
void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients,
typename TTypes<T>::ConstTensor features,
typename TTypes<T>::Tensor backprops) {
// NOTE: When the activation is exactly zero, we do not propagate the
// associated gradient value. This allows the output of the Relu to be used,
// as well as its input.
backprops.device(d) =
gradients * (features > features.constant(static_cast<T>(0)));
// Out of line check to save code space (we have this code once, rather
// than once for every NDIMS * NumTypes * Num_different_relu_variants
// functions).
struct ReluHelpers {
static void ValidateSameSizeHelper(OpKernelContext* context, const Tensor& g,
const Tensor& a) {
OP_REQUIRES(context, a.IsSameSize(g),
errors::InvalidArgument("g and a must be the same size"));
}
static bool ValidateSameSize(OpKernelContext* context, const Tensor& g,
const Tensor& a) {
ValidateSameSizeHelper(context, g, a);
return context->status().ok();
}
};
// Functor used by Relu6Op to do the computations.
template <typename Device, typename T>
struct Relu6 {
// Computes Relu6 activation.
//
// features: any shape.
// activations: same shape as "features".
void operator()(const Device& d, typename TTypes<T>::ConstTensor features,
typename TTypes<T>::Tensor activations) {
activations.device(d) =
features.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(6));
class ReluGradOp : public BinaryElementWiseOp<T, ReluGradOp<Device, T>> {
public:
using BinaryElementWiseOp<T, ReluGradOp<Device, T>>::BinaryElementWiseOp;
void OperateNoTemplate(OpKernelContext* context, const Tensor& g,
const Tensor& a, Tensor* output);
// INPUTS:
// g (gradients): backpropagated gradients
// a (inputs): either the inputs that were passed to ReluOp(), or its
// outputs (using either one yields the same result here).
// OUTPUT:
// gradients to backprop
template <int NDIMS>
void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a,
Tensor* output) {
OperateNoTemplate(context, g, a, output);
}
};
// Functor used by Relu6GradOp to do the computations.
template <typename Device, typename T>
struct Relu6Grad {
// Computes Relu6Grad backprops.
//
// gradients: gradients backpropagated to the Relu6 op.
// features: inputs that were passed to the Relu6 op.
// backprops: gradients to backpropagate to the Relu6 inputs.
void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients,
typename TTypes<T>::ConstTensor features,
typename TTypes<T>::Tensor backprops) {
// NOTE: When the activation is exactly zero or six, we
// arbitrarily choose to not propagate the associated gradient
// value.
backprops.device(d) = gradients *
(features > features.constant(static_cast<T>(0))) *
(features < features.constant(static_cast<T>(6)));
void ReluGradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
const Tensor& g, const Tensor& a,
Tensor* output) {
if (!ReluHelpers::ValidateSameSize(context, g, a)) return;
functor::ReluGrad<Device, T> functor;
functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(),
output->flat<T>());
}
template <typename Device, typename T>
class Relu6Op : public UnaryElementWiseOp<T, Relu6Op<Device, T>> {
public:
using UnaryElementWiseOp<T, Relu6Op<Device, T>>::UnaryElementWiseOp;
void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
functor::Relu6<Device, T> functor;
functor(context->eigen_device<Device>(), input.flat<T>(),
output->flat<T>());
}
};
// Functor used by EluOp to do the computations.
template <typename Device, typename T>
struct Elu {
// Computes Elu activation.
//
// features: any shape.
// activations: same shape as "features".
void operator()(const Device& d, typename TTypes<T>::ConstTensor features,
typename TTypes<T>::Tensor activations) {
// features.constant(?)
activations.device(d) =
(features < static_cast<T>(0))
.select(features.exp() - features.constant(static_cast<T>(1)),
features);
class Relu6GradOp : public BinaryElementWiseOp<T, Relu6GradOp<Device, T>> {
public:
using BinaryElementWiseOp<T, Relu6GradOp<Device, T>>::BinaryElementWiseOp;
void OperateNoTemplate(OpKernelContext* context, const Tensor& g,
const Tensor& a, Tensor* output);
// INPUTS:
// g (gradients): backpropagated gradients
// a (inputs): inputs that were passed to Relu6Op()
// OUTPUT:
// gradients to backprop
template <int NDIMS>
void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a,
Tensor* output) {
OperateNoTemplate(context, g, a, output);
}
};
// Functor used by EluGradOp to do the computations.
template <typename Device, typename T>
struct EluGrad {
// Computes EluGrad backprops.
//
// gradients: gradients backpropagated to the Elu op.
// activations: outputs of the Elu op.
// backprops: gradients to backpropagate to the Elu inputs.
void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients,
typename TTypes<T>::ConstTensor activations,
typename TTypes<T>::Tensor backprops) {
backprops.device(d) =
(activations < static_cast<T>(0))
.select((activations + static_cast<T>(1)) * gradients, gradients);
void Relu6GradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
const Tensor& g, const Tensor& a,
Tensor* output) {
if (!ReluHelpers::ValidateSameSize(context, g, a)) return;
functor::Relu6Grad<Device, T> functor;
functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(),
output->flat<T>());
}
template <typename Device, typename T>
class EluOp : public UnaryElementWiseOp<T, EluOp<Device, T>> {
public:
using UnaryElementWiseOp<T, EluOp<Device, T>>::UnaryElementWiseOp;
void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
functor::Elu<Device, T> functor;
functor(context->eigen_device<Device>(), input.flat<T>(),
output->flat<T>());
}
};
} // namespace functor
template <typename Device, typename T>
class EluGradOp : public BinaryElementWiseOp<T, EluGradOp<Device, T>> {
public:
using BinaryElementWiseOp<T, EluGradOp<Device, T>>::BinaryElementWiseOp;
void OperateNoTemplate(OpKernelContext* context, const Tensor& g,
const Tensor& a, Tensor* output);
// INPUTS:
// g (gradients): backpropagated gradients
// a (outputs): outputs of the EluOp()
// OUTPUT:
// gradients to backprop
template <int NDIMS>
void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a,
Tensor* output) {
OperateNoTemplate(context, g, a, output);
}
};
template <typename Device, typename T>
void EluGradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
const Tensor& g, const Tensor& a,
Tensor* output) {
if (!ReluHelpers::ValidateSameSize(context, g, a)) return;
functor::EluGrad<Device, T> functor;
functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(),
output->flat<T>());
}
} // namespace tensorflow
#undef EIGEN_USE_THREADS
#endif // TENSORFLOW_KERNELS_RELU_OP_H_

View File

@ -0,0 +1,130 @@
/* Copyright 2015 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_KERNELS_RELU_OP_FUNCTOR_H_
#define TENSORFLOW_KERNELS_RELU_OP_FUNCTOR_H_
// Functor definition for ReluOp and ReluGradOp, must be compilable by nvcc.
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor_types.h"
namespace tensorflow {
namespace functor {
// Functor used by ReluOp to do the computations.
template <typename Device, typename T>
struct Relu {
// Computes Relu activation.
//
// features: any shape.
// activations: same shape as "features".
void operator()(const Device& d, typename TTypes<T>::ConstTensor features,
typename TTypes<T>::Tensor activations) {
activations.device(d) = features.cwiseMax(static_cast<T>(0));
}
};
// Functor used by ReluGradOp to do the computations.
template <typename Device, typename T>
struct ReluGrad {
// Computes ReluGrad backprops.
//
// gradients: gradients backpropagated to the Relu op.
// features: either the inputs that were passed to the Relu op, or its
// outputs (using either one yields the same result here).
// backprops: gradients to backpropagate to the Relu inputs.
void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients,
typename TTypes<T>::ConstTensor features,
typename TTypes<T>::Tensor backprops) {
// NOTE: When the activation is exactly zero, we do not propagate the
// associated gradient value. This allows the output of the Relu to be used,
// as well as its input.
backprops.device(d) =
gradients * (features > features.constant(static_cast<T>(0)));
}
};
// Functor used by Relu6Op to do the computations.
template <typename Device, typename T>
struct Relu6 {
// Computes Relu6 activation.
//
// features: any shape.
// activations: same shape as "features".
void operator()(const Device& d, typename TTypes<T>::ConstTensor features,
typename TTypes<T>::Tensor activations) {
activations.device(d) =
features.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(6));
}
};
// Functor used by Relu6GradOp to do the computations.
template <typename Device, typename T>
struct Relu6Grad {
// Computes Relu6Grad backprops.
//
// gradients: gradients backpropagated to the Relu6 op.
// features: inputs that were passed to the Relu6 op.
// backprops: gradients to backpropagate to the Relu6 inputs.
void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients,
typename TTypes<T>::ConstTensor features,
typename TTypes<T>::Tensor backprops) {
// NOTE: When the activation is exactly zero or six, we
// arbitrarily choose to not propagate the associated gradient
// value.
backprops.device(d) = gradients *
(features > features.constant(static_cast<T>(0))) *
(features < features.constant(static_cast<T>(6)));
}
};
// Functor used by EluOp to do the computations.
template <typename Device, typename T>
struct Elu {
// Computes Elu activation.
//
// features: any shape.
// activations: same shape as "features".
void operator()(const Device& d, typename TTypes<T>::ConstTensor features,
typename TTypes<T>::Tensor activations) {
// features.constant(?)
activations.device(d) =
(features < static_cast<T>(0))
.select(features.exp() - features.constant(static_cast<T>(1)),
features);
}
};
// Functor used by EluGradOp to do the computations.
template <typename Device, typename T>
struct EluGrad {
// Computes EluGrad backprops.
//
// gradients: gradients backpropagated to the Elu op.
// activations: outputs of the Elu op.
// backprops: gradients to backpropagate to the Elu inputs.
void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients,
typename TTypes<T>::ConstTensor activations,
typename TTypes<T>::Tensor backprops) {
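// For x < 0, elu(x) = exp(x) - 1, so d(elu)/dx = exp(x) = elu(x) + 1; this is
// why the negative branch below scales gradients by (activations + 1).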
backprops.device(d) =
(activations < static_cast<T>(0))
.select((activations + static_cast<T>(1)) * gradients, gradients);
}
};
} // namespace functor
} // namespace tensorflow
#endif // TENSORFLOW_KERNELS_RELU_OP_FUNCTOR_H_

View File

@ -19,7 +19,7 @@ limitations under the License.
#include <stdio.h>
#include "tensorflow/core/kernels/relu_op.h"
#include "tensorflow/core/kernels/relu_op_functor.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor_types.h"

View File

@ -24,6 +24,7 @@ limitations under the License.
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/image_resizer_state.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"
@ -40,49 +41,22 @@ class ResizeAreaOp : public OpKernel {
void Compute(OpKernelContext* context) override {
const Tensor& input = context->input(0);
OP_REQUIRES(context, input.dims() == 4,
errors::InvalidArgument("input must be 4-dimensional",
input.shape().DebugString()));
const Tensor& shape_t = context->input(1);
OP_REQUIRES(context, shape_t.dims() == 1,
errors::InvalidArgument("shape_t must be 1-dimensional",
shape_t.shape().DebugString()));
OP_REQUIRES(context, shape_t.NumElements() == 2,
errors::InvalidArgument("shape_t must have two elements",
shape_t.shape().DebugString()));
ImageResizerState st(align_corners_);
st.ValidateAndCreateOutput(context, input);
auto Svec = shape_t.vec<int32>();
Tensor* output = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(
0, TensorShape({input.dim_size(0), Svec(0),
Svec(1), input.dim_size(3)}),
&output));
const int64 batch_size = input.dim_size(0);
const int64 in_height = input.dim_size(1);
const int64 in_width = input.dim_size(2);
const int64 channels = input.dim_size(3);
const int64 out_height = output->dim_size(1);
const int64 out_width = output->dim_size(2);
if (!context->status().ok()) return;
typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>();
typename TTypes<float, 4>::Tensor output_data = output->tensor<float, 4>();
typename TTypes<float, 4>::Tensor output_data =
st.output->tensor<float, 4>();
// A temporary tensor for computing the sum.
Tensor sum_tensor;
OP_REQUIRES_OK(
context, context->allocate_temp(DataTypeToEnum<float>::value,
TensorShape({channels}), &sum_tensor));
OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<float>::value,
TensorShape({st.channels}),
&sum_tensor));
typename TTypes<float, 1>::Tensor sum_data = sum_tensor.vec<float>();
const float height_scale =
(align_corners_ && out_height > 1)
? (in_height - 1) / static_cast<float>(out_height - 1)
: in_height / static_cast<float>(out_height);
const float width_scale =
(align_corners_ && out_width > 1)
? (in_width - 1) / static_cast<float>(out_width - 1)
: in_width / static_cast<float>(out_width);
// When using this algorithm for downsizing, the target pixel value is the
// weighted average of all the source pixels. The weight is determined by
// the contribution percentage of the source pixel.
@ -102,19 +76,19 @@ class ResizeAreaOp : public OpKernel {
// out[0] = (in[0] * 1.0 + in[1] * 1/3) * scale
// out[1] = (in[1] * 2/3 + in[2] * 2/3) * scale
// out[2] = (in[2] * 1/3 + in[3] * 1.0) * scale
float scale = 1.0 / (height_scale * width_scale);
for (int64 b = 0; b < batch_size; ++b) {
for (int64 y = 0; y < out_height; ++y) {
const float in_y = y * height_scale;
const float in_y1 = (y + 1) * height_scale;
float scale = 1.0 / (st.height_scale * st.width_scale);
for (int64 b = 0; b < st.batch_size; ++b) {
for (int64 y = 0; y < st.out_height; ++y) {
const float in_y = y * st.height_scale;
const float in_y1 = (y + 1) * st.height_scale;
// The start and end height indices of all the cells that could
// contribute to the target cell.
int64 y_start = floor(in_y);
int64 y_end = ceil(in_y1);
for (int64 x = 0; x < out_width; ++x) {
const float in_x = x * width_scale;
const float in_x1 = (x + 1) * width_scale;
for (int64 x = 0; x < st.out_width; ++x) {
const float in_x = x * st.width_scale;
const float in_x1 = (x + 1) * st.width_scale;
// The start and end width indices of all the cells that could
// contribute to the target cell.
int64 x_start = floor(in_x);
@ -127,16 +101,16 @@ class ResizeAreaOp : public OpKernel {
for (int64 j = x_start; j < x_end; ++j) {
float scale_x =
j < in_x ? j + 1 - in_x : (j + 1 > in_x1 ? in_x1 - j : 1.0);
for (int64 c = 0; c < channels; ++c) {
for (int64 c = 0; c < st.channels; ++c) {
#define BOUND(val, limit) std::min(((limit)-1ll), (std::max(0ll, (val))))
sum_data(c) +=
input_data(b, BOUND(i, in_height), BOUND(j, in_width), c) *
scale_y * scale_x * scale;
sum_data(c) += input_data(b, BOUND(i, st.in_height),
BOUND(j, st.in_width), c) *
scale_y * scale_x * scale;
#undef BOUND
}
}
}
for (int64 c = 0; c < channels; ++c) {
for (int64 c = 0; c < st.channels; ++c) {
output_data(b, y, x, c) = sum_data(c);
}
}

View File

@ -26,6 +26,7 @@ limitations under the License.
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/image_resizer_state.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"
@ -92,62 +93,28 @@ class ResizeBicubicOp : public OpKernel {
void Compute(OpKernelContext* context) override {
const Tensor& input = context->input(0);
OP_REQUIRES(context, input.dims() == 4,
errors::InvalidArgument("input must be 4-dimensional",
input.shape().DebugString()));
const Tensor& shape_t = context->input(1);
OP_REQUIRES(context, shape_t.dims() == 1,
errors::InvalidArgument("shape_t must be 1-dimensional",
shape_t.shape().DebugString()));
OP_REQUIRES(context, shape_t.NumElements() == 2,
errors::InvalidArgument("shape_t must have two elements",
shape_t.shape().DebugString()));
ImageResizerState st(align_corners_);
st.ValidateAndCreateOutput(context, input);
auto Svec = shape_t.vec<int32>();
// Initialize shape to the batch size of the input, then add
// the rest of the dimensions
Tensor* output = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(
0, TensorShape({input.dim_size(0), Svec(0),
Svec(1), input.dim_size(3)}),
&output));
const int64 batch_size = input.dim_size(0);
const int64 in_height = input.dim_size(1);
const int64 in_width = input.dim_size(2);
const int64 channels = input.dim_size(3);
const int64 out_height = output->dim_size(1);
const int64 out_width = output->dim_size(2);
CHECK_GT(in_height, 0);
CHECK_GT(in_width, 0);
CHECK_GT(channels, 0);
CHECK_GT(out_height, 0);
CHECK_GT(out_width, 0);
if (!context->status().ok()) return;
typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>();
typename TTypes<float, 4>::Tensor output_data = output->tensor<float, 4>();
const float height_scale =
(align_corners_ && out_height > 1)
? (in_height - 1) / static_cast<float>(out_height - 1)
: in_height / static_cast<float>(out_height);
const float width_scale =
(align_corners_ && out_width > 1)
? (in_width - 1) / static_cast<float>(out_width - 1)
: in_width / static_cast<float>(out_width);
typename TTypes<float, 4>::Tensor output_data =
st.output->tensor<float, 4>();
std::array<float, 4> coeff = {{0.0, 0.0, 0.0, 0.0}};
for (int64 b = 0; b < batch_size; ++b) {
for (int64 y = 0; y < out_height; ++y) {
for (int64 b = 0; b < st.batch_size; ++b) {
for (int64 y = 0; y < st.out_height; ++y) {
std::array<float, 4> y_weights;
std::array<int64, 4> y_indices;
GetWeightsAndIndices(height_scale, y, in_height, &y_weights,
GetWeightsAndIndices(st.height_scale, y, st.in_height, &y_weights,
&y_indices);
for (int64 x = 0; x < out_width; ++x) {
for (int64 x = 0; x < st.out_width; ++x) {
std::array<float, 4> x_weights;
std::array<int64, 4> x_indices;
GetWeightsAndIndices(width_scale, x, in_width, &x_weights,
GetWeightsAndIndices(st.width_scale, x, st.in_width, &x_weights,
&x_indices);
for (int64 c = 0; c < channels; ++c) {
for (int64 c = 0; c < st.channels; ++c) {
// Use a 4x4 patch to compute the interpolated output value at
// (b, y, x, c).
for (int64 i = 0; i < 4; ++i) {

View File

@ -23,6 +23,7 @@ limitations under the License.
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/image_resizer_state.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"
@ -39,64 +40,29 @@ class ResizeBilinearOp : public OpKernel {
void Compute(OpKernelContext* context) override {
const Tensor& input = context->input(0);
OP_REQUIRES(context, input.dims() == 4,
errors::InvalidArgument("input must be 4-dimensional",
input.shape().DebugString()));
const Tensor& shape_t = context->input(1);
OP_REQUIRES(context, shape_t.dims() == 1,
errors::InvalidArgument("shape_t must be 1-dimensional",
shape_t.shape().DebugString()));
OP_REQUIRES(context, shape_t.NumElements() == 2,
errors::InvalidArgument("shape_t must have two elements",
shape_t.shape().DebugString()));
ImageResizerState st(align_corners_);
st.ValidateAndCreateOutput(context, input);
auto Svec = shape_t.vec<int32>();
// Initialize shape to the batch size of the input, then add
// the rest of the dimensions
Tensor* output = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(
0, TensorShape({input.dim_size(0), Svec(0),
Svec(1), input.dim_size(3)}),
&output));
const int64 batch_size = input.dim_size(0);
const int64 in_height = input.dim_size(1);
const int64 in_width = input.dim_size(2);
const int64 channels = input.dim_size(3);
const int64 out_height = output->dim_size(1);
const int64 out_width = output->dim_size(2);
CHECK_GT(in_height, 0);
CHECK_GT(in_width, 0);
CHECK_GT(channels, 0);
CHECK_GT(out_height, 0);
CHECK_GT(out_width, 0);
if (!context->status().ok()) return;
typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>();
typename TTypes<float, 4>::Tensor output_data = output->tensor<float, 4>();
typename TTypes<float, 4>::Tensor output_data =
st.output->tensor<float, 4>();
const float height_scale =
(align_corners_ && out_height > 1)
? (in_height - 1) / static_cast<float>(out_height - 1)
: in_height / static_cast<float>(out_height);
const float width_scale =
(align_corners_ && out_width > 1)
? (in_width - 1) / static_cast<float>(out_width - 1)
: in_width / static_cast<float>(out_width);
for (int b = 0; b < batch_size; ++b) {
for (int y = 0; y < out_height; ++y) {
const float in_y = y * height_scale;
for (int b = 0; b < st.batch_size; ++b) {
for (int y = 0; y < st.out_height; ++y) {
const float in_y = y * st.height_scale;
const int top_y_index = static_cast<int>(floorf(in_y));
const int bottom_y_index =
std::min(static_cast<int64>(ceilf(in_y)), (in_height - 1));
std::min(static_cast<int64>(ceilf(in_y)), (st.in_height - 1));
const float y_lerp = in_y - top_y_index;
for (int x = 0; x < out_width; ++x) {
const float in_x = x * width_scale;
for (int x = 0; x < st.out_width; ++x) {
const float in_x = x * st.width_scale;
const int left_x_index = static_cast<int>(floorf(in_x));
const int right_x_index =
std::min(static_cast<int64>(ceilf(in_x)), (in_width - 1));
std::min(static_cast<int64>(ceilf(in_x)), (st.in_width - 1));
const float x_lerp = in_x - left_x_index;
for (int c = 0; c < channels; ++c) {
for (int c = 0; c < st.channels; ++c) {
const float top_left = input_data(b, top_y_index, left_x_index, c);
const float top_right =
input_data(b, top_y_index, right_x_index, c);
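For reference, the scale-factor convention and the two-axis linear interpolation used above can be sketched in NumPy as follows. `resize_bilinear_ref` is an illustrative stand-in, not the kernel itself.

```python
import numpy as np

def compute_scale(in_size, out_size, align_corners):
    # Same convention as the kernel: align_corners maps the corner pixels
    # of input and output onto each other.
    if align_corners and out_size > 1:
        return (in_size - 1) / float(out_size - 1)
    return in_size / float(out_size)

def resize_bilinear_ref(image, out_h, out_w, align_corners=False):
    # image: [H, W] single-channel array; a reference sketch only.
    in_h, in_w = image.shape
    h_scale = compute_scale(in_h, out_h, align_corners)
    w_scale = compute_scale(in_w, out_w, align_corners)
    out = np.empty((out_h, out_w), dtype=np.float32)
    for y in range(out_h):
        in_y = y * h_scale
        top, bottom = int(np.floor(in_y)), min(int(np.ceil(in_y)), in_h - 1)
        y_lerp = in_y - top
        for x in range(out_w):
            in_x = x * w_scale
            left, right = int(np.floor(in_x)), min(int(np.ceil(in_x)), in_w - 1)
            x_lerp = in_x - left
            top_val = image[top, left] + (image[top, right] - image[top, left]) * x_lerp
            bot_val = image[bottom, left] + (image[bottom, right] - image[bottom, left]) * x_lerp
            out[y, x] = top_val + (bot_val - top_val) * y_lerp
    return out

print(resize_bilinear_ref(np.array([[0.0, 1.0], [2.0, 3.0]]), 4, 4))
```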

View File

@ -23,6 +23,7 @@ limitations under the License.
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/image_resizer_state.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"
@ -44,56 +45,28 @@ class ResizeNearestNeighborOp : public OpKernel {
void Compute(OpKernelContext* context) override {
const Tensor& input = context->input(0);
OP_REQUIRES(context, input.dims() == 4,
errors::InvalidArgument("input must be 4-dimensional",
input.shape().DebugString()));
const Tensor& shape_t = context->input(1);
OP_REQUIRES(context, shape_t.dims() == 1,
errors::InvalidArgument("shape_t must be 1-dimensional",
shape_t.shape().DebugString()));
OP_REQUIRES(context, shape_t.NumElements() == 2,
errors::InvalidArgument("shape_t must have two elements",
shape_t.shape().DebugString()));
ImageResizerState st(align_corners_);
st.ValidateAndCreateOutput(context, input);
auto sizes = shape_t.vec<int32>();
OP_REQUIRES(context, sizes(0) > 0 && sizes(1) > 0,
errors::InvalidArgument("shape_t's elements must be positive"));
if (!context->status().ok()) return;
// Initialize shape to the batch size of the input, then add
// the rest of the dimensions
Tensor* output = nullptr;
OP_REQUIRES_OK(
context, context->allocate_output(0, TensorShape({input.dim_size(0), sizes(0),
sizes(1), input.dim_size(3)}),
&output));
const int64 batch_size = input.dim_size(0);
const int64 in_height = input.dim_size(1);
const int64 in_width = input.dim_size(2);
const int64 channels = input.dim_size(3);
const int64 out_height = output->dim_size(1);
const int64 out_width = output->dim_size(2);
OP_REQUIRES(context, st.in_height < (1 << 24) && st.in_width < (1 << 24),
errors::InvalidArgument("nearest neighbor requires max height "
"& width of 2^24"));
typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>();
typename TTypes<T, 4>::Tensor output_data = output->tensor<T, 4>();
typename TTypes<T, 4>::Tensor output_data = st.output->tensor<T, 4>();
const float height_scale =
(align_corners_ && out_height > 1)
? (in_height - 1) / static_cast<float>(out_height - 1)
: in_height / static_cast<float>(out_height);
const float width_scale =
(align_corners_ && out_width > 1)
? (in_width - 1) / static_cast<float>(out_width - 1)
: in_width / static_cast<float>(out_width);
for (int b = 0; b < batch_size; ++b) {
for (int y = 0; y < out_height; ++y) {
const int in_y = std::min(static_cast<int64>(floorf(y * height_scale)),
(in_height - 1));
for (int x = 0; x < out_width; ++x) {
const int in_x = std::min(static_cast<int64>(floorf(x * width_scale)),
(in_width - 1));
for (int c = 0; c < channels; ++c) {
for (int b = 0; b < st.batch_size; ++b) {
for (int y = 0; y < st.out_height; ++y) {
const int in_y =
std::min(static_cast<int64>(floorf(y * st.height_scale)),
(st.in_height - 1));
for (int x = 0; x < st.out_width; ++x) {
const int in_x =
std::min(static_cast<int64>(floorf(x * st.width_scale)),
(st.in_width - 1));
for (int c = 0; c < st.channels; ++c) {
output_data(b, y, x, c) = input_data(b, in_y, in_x, c);
}
}
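The nearest-neighbor path uses the same scale convention but simply snaps each output coordinate to the closest lower input index. A small NumPy sketch, purely for illustration:

```python
import numpy as np

def resize_nearest_ref(image, out_h, out_w, align_corners=False):
    # image: [H, W]; mirrors the index mapping in the kernel above.
    in_h, in_w = image.shape
    h_scale = ((in_h - 1) / (out_h - 1) if align_corners and out_h > 1
               else in_h / out_h)
    w_scale = ((in_w - 1) / (out_w - 1) if align_corners and out_w > 1
               else in_w / out_w)
    out = np.empty((out_h, out_w), dtype=image.dtype)
    for y in range(out_h):
        in_y = min(int(np.floor(y * h_scale)), in_h - 1)
        for x in range(out_w):
            in_x = min(int(np.floor(x * w_scale)), in_w - 1)
            out[y, x] = image[in_y, in_x]
    return out

# 2x2 -> 4x4 upscale repeats each source pixel in a 2x2 block.
print(resize_nearest_ref(np.array([[1, 2], [3, 4]]), 4, 4))
```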

View File

@ -28,29 +28,6 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
template <typename Device, typename T>
class SoftmaxOp : public OpKernel {
public:
explicit SoftmaxOp(OpKernelConstruction* context) : OpKernel(context) {
log_ = StringPiece(name()).starts_with("Log");
}
void Compute(OpKernelContext* context) override {
const Tensor& logits_in = context->input(0);
OP_REQUIRES(context, TensorShapeUtils::IsMatrix(logits_in.shape()),
errors::InvalidArgument("logits must be 2-dimensional"));
Tensor* softmax_out = nullptr;
OP_REQUIRES_OK(
context, context->allocate_output(0, logits_in.shape(), &softmax_out));
functor::SoftmaxFunctor<Device, T> functor;
functor(context->eigen_device<Device>(), logits_in.matrix<T>(),
softmax_out->matrix<T>(), log_);
}
private:
bool log_;
};
// Partial specialization for a CPUDevice, that uses the Eigen implementation
// from SoftmaxEigenImpl.
namespace functor {

View File

@ -13,89 +13,48 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// See docs in ../ops/nn_ops.cc.
#ifndef TENSORFLOW_KERNELS_SOFTMAX_OP_H_
#define TENSORFLOW_KERNELS_SOFTMAX_OP_H_
// Functor definition for SoftmaxOp, must be compilable by nvcc.
#define EIGEN_USE_THREADS
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/kernels/softmax_op_functor.h"
namespace tensorflow {
namespace functor {
// Functor used by SoftmaxOp to do the computations.
template <typename Device, typename T>
struct SoftmaxFunctor {
// Computes Softmax or LogSoftmax activation.
//
// logits: dim: batch_size, num_classes.
// softmax: dims: batch_size, num_classes.
// log: boolean
void operator()(const Device& d, typename TTypes<T>::ConstMatrix logits,
typename TTypes<T>::Matrix softmax, const bool log);
};
class SoftmaxOp : public OpKernel {
public:
explicit SoftmaxOp(OpKernelConstruction* context) : OpKernel(context) {
log_ = StringPiece(name()).starts_with("Log");
}
// Eigen code implementing SoftmaxFunctor::operator() or
// LogSoftmaxFunctor::operator().
// This code works for both CPU and GPU and is used by the functor
// specializations for both device types.
template <typename Device, typename T>
struct SoftmaxEigenImpl {
static void Compute(const Device& d, typename TTypes<T>::ConstMatrix logits,
typename TTypes<T>::Matrix softmax, const bool log) {
const int kBatchDim = 0;
const int kClassDim = 1;
const int batch_size = logits.dimension(kBatchDim);
const int num_classes = logits.dimension(kClassDim);
// These arrays are used to reduce along the class dimension, and broadcast
// the resulting value to all classes.
#if !defined(EIGEN_HAS_INDEX_LIST)
Eigen::DSizes<int, 1> along_class(kClassDim);
Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
Eigen::DSizes<int, 2> one_by_class(1, num_classes);
#else
Eigen::IndexList<Eigen::type2index<kClassDim> > along_class;
Eigen::IndexList<Eigen::type2index<1> > depth_dim;
Eigen::IndexList<int, Eigen::type2index<1> > batch_by_one;
batch_by_one.set(0, batch_size);
Eigen::IndexList<Eigen::type2index<1>, int> one_by_class;
one_by_class.set(1, num_classes);
#endif
//shifted_logits = logits - max(logits along classes);
auto shifted_logits = (logits - logits.maximum(along_class)
.eval()
.reshape(batch_by_one)
.broadcast(one_by_class));
if (log) {
// Calculate the log of the softmax
// softmax = logits - max(logits along classes);
softmax.device(d) = shifted_logits;
// softmax = softmax - log(sum(exp(softmax along classes)));
softmax.device(d) = (softmax -
softmax.exp().sum(along_class)
.eval()
.reshape(batch_by_one)
.broadcast(one_by_class)
.log());
} else {
// NOTE(touts): If you modify this implementation please run
// the BM_ImageNetSoftmaxFwd benchmark in nn_ops_test.cc.
//
// softmax = exp(logits - max(logits along classes));
softmax.device(d) = shifted_logits.exp();
// softmax = softmax / sum(softmax along classes);
softmax.device(d) = (softmax /
softmax.sum(along_class)
.eval()
.reshape(batch_by_one)
.broadcast(one_by_class));
void Compute(OpKernelContext* context) override {
const Tensor& logits_in = context->input(0);
OP_REQUIRES(context, TensorShapeUtils::IsMatrix(logits_in.shape()),
errors::InvalidArgument("logits must be 2-dimensional"));
Tensor* softmax_out = nullptr;
OP_REQUIRES_OK(
context, context->allocate_output(0, logits_in.shape(), &softmax_out));
if (logits_in.NumElements()) {
functor::SoftmaxFunctor<Device, T> functor;
functor(context->eigen_device<Device>(), logits_in.matrix<T>(),
softmax_out->matrix<T>(), log_);
}
}
private:
bool log_;
};
} // namespace functor
} // namespace tensorflow
#undef EIGEN_USE_THREADS
#endif // TENSORFLOW_KERNELS_SOFTMAX_OP_H_

View File

@ -0,0 +1,101 @@
/* Copyright 2015 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_KERNELS_SOFTMAX_OP_FUNCTOR_H_
#define TENSORFLOW_KERNELS_SOFTMAX_OP_FUNCTOR_H_
// Functor definition for SoftmaxOp, must be compilable by nvcc.
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor_types.h"
namespace tensorflow {
namespace functor {
// Functor used by SoftmaxOp to do the computations.
template <typename Device, typename T>
struct SoftmaxFunctor {
// Computes Softmax or LogSoftmax activation.
//
// logits: dim: batch_size, num_classes.
// softmax: dims: batch_size, num_classes.
// log: boolean
void operator()(const Device& d, typename TTypes<T>::ConstMatrix logits,
typename TTypes<T>::Matrix softmax, const bool log);
};
// Eigen code implementing SoftmaxFunctor::operator() or
// LogSoftmaxFunctor::operator().
// This code works for both CPU and GPU and is used by the functor
// specializations for both device types.
template <typename Device, typename T>
struct SoftmaxEigenImpl {
static void Compute(const Device& d, typename TTypes<T>::ConstMatrix logits,
typename TTypes<T>::Matrix softmax, const bool log) {
const int kBatchDim = 0;
const int kClassDim = 1;
const int batch_size = logits.dimension(kBatchDim);
const int num_classes = logits.dimension(kClassDim);
// These arrays are used to reduce along the class dimension, and broadcast
// the resulting value to all classes.
#if !defined(EIGEN_HAS_INDEX_LIST)
Eigen::DSizes<int, 1> along_class(kClassDim);
Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
Eigen::DSizes<int, 2> one_by_class(1, num_classes);
#else
Eigen::IndexList<Eigen::type2index<kClassDim> > along_class;
Eigen::IndexList<Eigen::type2index<1> > depth_dim;
Eigen::IndexList<int, Eigen::type2index<1> > batch_by_one;
batch_by_one.set(0, batch_size);
Eigen::IndexList<Eigen::type2index<1>, int> one_by_class;
one_by_class.set(1, num_classes);
#endif
//shifted_logits = logits - max(logits along classes);
auto shifted_logits = (logits - logits.maximum(along_class)
.eval()
.reshape(batch_by_one)
.broadcast(one_by_class));
if (log) {
// Calculate the log of the softmax
// softmax = logits - max(logits along classes);
softmax.device(d) = shifted_logits;
// softmax = softmax - log(sum(exp(softmax along classes)));
softmax.device(d) = (softmax -
softmax.exp().sum(along_class)
.eval()
.reshape(batch_by_one)
.broadcast(one_by_class)
.log());
} else {
// NOTE(touts): If you modify this implementation please run
// the BM_ImageNetSoftmaxFwd benchmark in nn_ops_test.cc.
//
// softmax = exp(logits - max(logits along classes));
softmax.device(d) = shifted_logits.exp();
// softmax = softmax / sum(softmax along classes);
softmax.device(d) = (softmax /
softmax.sum(along_class)
.eval()
.reshape(batch_by_one)
.broadcast(one_by_class));
}
}
};
} // namespace functor
} // namespace tensorflow
#endif // TENSORFLOW_KERNELS_SOFTMAX_OP_FUNCTOR_H_
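The max subtraction in `SoftmaxEigenImpl` is the usual numerical-stability trick: shifting by the per-row maximum keeps `exp()` from overflowing without changing the result. A NumPy sketch of the same computation (not the Eigen code itself):

```python
import numpy as np

def softmax_ref(logits, log=False):
    # logits: [batch_size, num_classes]; subtract the per-row max first so
    # exp() never sees large positive arguments (same pattern as above).
    shifted = logits - logits.max(axis=1, keepdims=True)
    if log:
        # log_softmax = shifted - log(sum(exp(shifted)))
        return shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    exp = np.exp(shifted)
    return exp / exp.sum(axis=1, keepdims=True)

x = np.array([[1.0, 2.0, 3.0], [1000.0, 1000.0, 1000.0]])
print(softmax_ref(x))            # rows sum to 1, no overflow for the 1000s
print(softmax_ref(x, log=True))
```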

View File

@ -17,7 +17,7 @@ limitations under the License.
#define EIGEN_USE_GPU
#include "tensorflow/core/kernels/softmax_op.h"
#include "tensorflow/core/kernels/softmax_op_functor.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/types.h"

View File

@ -21,6 +21,8 @@ limitations under the License.
#include <string>
#include <utility>
#include "tensorflow/core/kernels/spacetodepth_op.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
@ -89,28 +91,44 @@ class SpaceToDepthOp : public OpKernel {
auto Toutput = outputs_tensor->tensor<T, 4>();
auto Tinput = input.tensor<T, 4>();
for (int b = 0; b < batch_size; ++b) {
for (int h = 0; h < height; ++h) {
const int out_h = h / block_size_;
const int offset_h = (h % block_size_);
for (int w = 0; w < width; ++w) {
const int out_w = w / block_size_;
const int offset_w = (w % block_size_);
const int offset_d =
(offset_h * block_size_ + offset_w) * input_depth;
for (int d = 0; d < input_depth; ++d) {
const int out_d = d + offset_d;
Toutput(b, out_h, out_w, out_d) = Tinput(b, h, w, d);
}
}
}
}
functor::SpaceToDepthOpFunctor<Device, T> functor;
functor(context->eigen_device<Device>(), Tinput, block_size_, Toutput);
};
private:
int block_size_;
};
// Partial specialization of SpaceToDepthOpFunctor for a CPUDevice.
namespace functor {
template <typename T>
struct SpaceToDepthOpFunctor<CPUDevice, T> {
void operator()(const CPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
int block_size, typename TTypes<T, 4>::Tensor output) {
const int batch_size = output.dimension(0);
const int input_height = input.dimension(1);
const int input_width = input.dimension(2);
const int input_depth = input.dimension(3);
for (int b = 0; b < batch_size; ++b) {
for (int h = 0; h < input_height; ++h) {
const int out_h = h / block_size;
const int offset_h = (h % block_size);
for (int w = 0; w < input_width; ++w) {
const int out_w = w / block_size;
const int offset_w = (w % block_size);
const int offset_d = (offset_h * block_size + offset_w) * input_depth;
for (int d = 0; d < input_depth; ++d) {
const int out_d = d + offset_d;
output(b, out_h, out_w, out_d) = input(b, h, w, d);
}
}
}
}
}
};
} // namespace functor
#define REGISTER(type) \
REGISTER_KERNEL_BUILDER( \
Name("SpaceToDepth").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
@ -119,4 +137,10 @@ class SpaceToDepthOp : public OpKernel {
TF_CALL_ALL_TYPES(REGISTER);
#undef REGISTER
#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(
Name("SpaceToDepth").Device(DEVICE_GPU).TypeConstraint<float>("T"),
SpaceToDepthOp<GPUDevice, float>);
#endif // GOOGLE_CUDA
} // end namespace tensorflow

View File

@ -0,0 +1,44 @@
/* Copyright 2015 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CORE_KERNELS_SPACETODEPTH_OP_H_
#define TENSORFLOW_CORE_KERNELS_SPACETODEPTH_OP_H_
// Functor definition for SpaceToDepthOp, must be compilable by nvcc.
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor_types.h"
namespace tensorflow {
namespace functor {
// Functor used by SpaceToDepthOp to do the computations.
template <typename Device, typename T>
struct SpaceToDepthOpFunctor {
// Implements the space to depth conversion.
//
// input: 4-D input tensor.
// block_size: block size for the conversion.
// output: 4-D output tensor.
//
// The dimensions of the tensors are guaranteed to be right when the
// functor is called.
void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor input,
int block_size, typename TTypes<T, 4>::Tensor output);
};
} // namespace functor
} // namespace tensorflow
#endif // TENSORFLOW_CORE_KERNELS_SPACETODEPTH_OP_H_
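The space-to-depth rearrangement implemented by the functors above moves each `block_size x block_size` spatial block into the depth dimension. A NumPy reference sketch, assuming height and width divide evenly by the block size:

```python
import numpy as np

def space_to_depth_ref(x, block_size):
    # x: [batch, height, width, depth]; mirrors the index arithmetic in the
    # CPU functor above.
    b, h, w, d = x.shape
    out = np.empty((b, h // block_size, w // block_size,
                    d * block_size * block_size), dtype=x.dtype)
    for hh in range(h):
        out_h, off_h = divmod(hh, block_size)
        for ww in range(w):
            out_w, off_w = divmod(ww, block_size)
            off_d = (off_h * block_size + off_w) * d
            out[:, out_h, out_w, off_d:off_d + d] = x[:, hh, ww, :]
    return out

x = np.arange(1, 17).reshape(1, 4, 4, 1)
print(space_to_depth_ref(x, 2)[0])  # each 2x2 spatial block becomes 4 channels
```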

View File

@ -0,0 +1,89 @@
/* Copyright 2015 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#include "tensorflow/core/kernels/spacetodepth_op.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"
namespace tensorflow {
typedef Eigen::GpuDevice GPUDevice;
template <typename dtype>
__global__ void S2D(const int32 nthreads, const dtype* input_ptr,
const int block_size, const int batch_size,
const int input_height, const int input_width,
const int input_depth, const int output_height,
const int output_width, const int output_depth,
dtype* output_ptr) {
CUDA_1D_KERNEL_LOOP(inp_idx, nthreads) {
// inp_idx = d + input_depth * (w + input_width * (h + input_height * b))
const int d = inp_idx % input_depth;
const int inp_idx2 = inp_idx / input_depth;
const int w = inp_idx2 % input_width;
const int inp_idx3 = inp_idx2 / input_width;
const int h = inp_idx3 % input_height;
const int b = inp_idx3 / input_height;
const int out_h = h / block_size;
const int offset_h = h % block_size;
const int out_w = w / block_size;
const int offset_w = w % block_size;
const int offset_d = (offset_h * block_size + offset_w) * input_depth;
const int out_d = d + offset_d;
const int out_idx =
out_d +
output_depth * (out_w + output_width * (out_h + output_height * b));
*(output_ptr + out_idx) = ldg(input_ptr + inp_idx);
}
}
// Specialization of SpaceToDepthOpFunctor for a GPUDevice.
namespace functor {
template <typename T>
struct SpaceToDepthOpFunctor<GPUDevice, T> {
void operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
int block_size, typename TTypes<T, 4>::Tensor output) {
const int batch_size = output.dimension(0);
const int input_height = input.dimension(1);
const int input_width = input.dimension(2);
const int input_depth = input.dimension(3);
const int output_height = output.dimension(1);
const int output_width = output.dimension(2);
const int output_depth = output.dimension(3);
const int total_count =
batch_size * input_height * input_width * input_depth;
CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
S2D<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
config.virtual_thread_count, input.data(), block_size, batch_size,
input_height, input_width, input_depth, output_height, output_width,
output_depth, output.data());
}
};
} // end namespace functor
// Instantiate the GPU implementation for float.
template struct functor::SpaceToDepthOpFunctor<GPUDevice, float>;
} // end namespace tensorflow
#endif // GOOGLE_CUDA
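Each CUDA thread above recovers `(b, h, w, d)` from a flat NHWC input index and recombines them into a flat output index. The same arithmetic in plain Python, for illustration only:

```python
def s2d_index(inp_idx, block_size, input_height, input_width, input_depth,
              output_height, output_width, output_depth):
    # Decompose inp_idx = d + input_depth * (w + input_width * (h + input_height * b)).
    d = inp_idx % input_depth
    rest = inp_idx // input_depth
    w = rest % input_width
    rest //= input_width
    h = rest % input_height
    b = rest // input_height
    # Recombine into the output position, folding the within-block offsets into depth.
    out_h, off_h = divmod(h, block_size)
    out_w, off_w = divmod(w, block_size)
    out_d = d + (off_h * block_size + off_w) * input_depth
    return out_d + output_depth * (out_w + output_width * (out_h + output_height * b))

# Input [1, 4, 4, 1] with block_size 2 -> output [1, 2, 2, 4].
for i in range(16):
    print(i, "->", s2d_index(i, 2, 4, 4, 1, 2, 2, 4))
```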

View File

@ -23,6 +23,7 @@ limitations under the License.
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/kernels/bounds_check.h"
#include "tensorflow/core/kernels/transpose_functor.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/strings/str_util.h"
@ -55,8 +56,8 @@ class InvertPermutationOp : public OpKernel {
auto Tout = output->vec<int32>();
std::fill_n(Tout.data(), N, -1);
for (int i = 0; i < N; ++i) {
const int32 d = Tin(i);
OP_REQUIRES(context, 0 <= d && d < N,
const int32 d = internal::SubtleMustCopy(Tin(i));
OP_REQUIRES(context, FastBoundsCheck(d, N),
errors::InvalidArgument(d, " is not between 0 and ", N));
OP_REQUIRES(context, Tout(d) == -1,
errors::InvalidArgument(d, " is duplicated in the input."));
@ -107,18 +108,26 @@ void TransposeOp::Compute(OpKernelContext* ctx) {
errors::InvalidArgument(
"transpose expects a vector of size ", input.dims(),
". But input(1) is a vector of size ", Vperm.size()));
gtl::ArraySlice<int32> permutation(
reinterpret_cast<const int32*>(Vperm.data()), dims);
// using volatile instead of SubtleMustCopy here so that the
// asynchrony boundary is permutation.
const volatile int32* perm_begin =
reinterpret_cast<const volatile int32*>(Vperm.data());
const std::vector<int32> permutation(perm_begin, perm_begin + dims);
TensorShape shape;
// Check whether permutation is a permutation of integers of [0 .. dims).
gtl::InlinedVector<bool, 8> bits(dims);
for (const int32 d : permutation) {
bool is_identity = true;
for (int i = 0; i < dims; ++i) {
const int32 d = permutation[i];
OP_REQUIRES(
ctx, 0 <= d && d < dims,
errors::InvalidArgument(d, " is out of range [0 .. ", dims, ")"));
bits[d] = true;
shape.AddDim(input.dim_size(d));
if (d != i) {
is_identity = false;
}
}
for (int i = 0; i < dims; ++i) {
OP_REQUIRES(ctx, bits[i], errors::InvalidArgument(
@ -126,8 +135,8 @@ void TransposeOp::Compute(OpKernelContext* ctx) {
str_util::Join(permutation, ","), "}."));
}
// 0-D and 1-D transposes do nothing
if (dims <= 1) {
// 0-D, 1-D, and identity transposes do nothing.
if (dims <= 1 || is_identity) {
ctx->set_output(0, input);
return;
}
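The permutation check above marks every destination dimension and also detects the identity permutation so the copy can be skipped. A small Python sketch of that logic (the function name is illustrative):

```python
def check_permutation(perm, dims):
    # Verifies perm is a permutation of [0, dims); returns True if it is the identity.
    seen = [False] * dims
    is_identity = True
    for i, d in enumerate(perm):
        if not 0 <= d < dims:
            raise ValueError("%d is out of range [0 .. %d)" % (d, dims))
        if seen[d]:
            raise ValueError("%d is duplicated in the permutation" % d)
        seen[d] = True
        if d != i:
            is_identity = False
    return is_identity

print(check_permutation([0, 1, 2], 3))  # True  -> 0-D/1-D/identity: return input as-is
print(check_permutation([2, 0, 1], 3))  # False -> a real transpose is needed
```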

View File

@ -139,7 +139,8 @@ class Session {
/// \brief Like `Run`, but allows users to pass in a `RunOptions` proto and
/// to retrieve non-Tensor metadata output via a `RunOutputs` proto for this
/// step.
/// step. `run_outputs` may be nullptr, in which case any metadata output is
/// discarded.
/// NOTE: This API is still experimental and may change.
virtual Status Run(const RunOptions& run_options,
const std::vector<std::pair<string, Tensor> >& inputs,
@ -148,8 +149,8 @@ class Session {
std::vector<Tensor>* outputs, RunOutputs* run_outputs);
/// \brief Sets up a graph for partial execution. All future feeds and
/// fetches are specified by 'input_names' and 'output_names'. Returns
/// 'handle' that can be used to perform a sequence of partial feeds and
/// fetches are specified by `input_names` and `output_names`. Returns
/// `handle` that can be used to perform a sequence of partial feeds and
/// fetches.
/// NOTE: This API is still experimental and may change.
virtual Status PRunSetup(const std::vector<string>& input_names,
@ -157,7 +158,7 @@ class Session {
const std::vector<string>& target_nodes,
string* handle);
/// \brief Continues the pending execution specified by 'handle' with the
/// \brief Continues the pending execution specified by `handle` with the
/// provided input tensors and fills `outputs` for the endpoints specified
/// in `output_names`.
/// NOTE: This API is still experimental and may change.

View File

@ -268,15 +268,26 @@ extern void TF_ExtendGraph(TF_Session*, const void* proto, size_t proto_len,
// failure, inputs[] become the property of the implementation (the
// implementation will eventually call TF_DeleteTensor on each input).
//
// The caller retains the ownership of both `run_options` and `run_outputs`, and
// should manually call TF_DeleteBuffer on them.
// Any NULL and non-NULL value combinations for (`run_options`,
// `run_outputs`) are valid.
//
// - `run_options` may be NULL, in which case it will be ignored; or
// non-NULL, in which case it must point to a `TF_Buffer` containing the
// serialized representation of a `RunOptions` protocol buffer.
// - `run_outputs` may be NULL, in which case it will be ignored; or non-NULL,
// in which case it must point to an empty, freshly allocated `TF_Buffer`
// that may be updated to contain the serialized representation of a
// `RunOutputs` protocol buffer.
//
// The caller retains the ownership of `run_options` and/or `run_outputs` (when
// not NULL) and should manually call TF_DeleteBuffer on them.
//
// On success, the tensors corresponding to output_names[0,noutputs-1]
// are placed in outputs[], and these outputs[] become the property
// of the caller (the caller must eventually call TF_DeleteTensor on
// them).
//
// On failure, outputs[] contains nulls.
// On failure, outputs[] contains NULLs.
extern void TF_Run(TF_Session*,
// RunOptions
const TF_Buffer* run_options,
@ -341,7 +352,7 @@ extern void TF_PRun(TF_Session*, const char* handle,
// On success, place OK in status and return the newly created library handle.
// The caller owns the library handle.
//
// On failure, place an error status in status and return nullptr.
// On failure, place an error status in status and return NULL.
extern TF_Library* TF_LoadLibrary(const char* library_filename,
TF_Status* status);

View File

@ -39,8 +39,10 @@ void Shard(int num_workers, thread::ThreadPool* workers, int64 total,
// much. Let us assume each cost unit is 1ns, kMinCostPerShard=10000
// is 10us.
static const int64 kMinCostPerShard = 10000;
const int num_shards = std::max(
1, std::min<int>(num_workers, total * cost_per_unit / kMinCostPerShard));
const int num_shards =
std::max<int>(1, std::min(static_cast<int64>(num_workers),
total * cost_per_unit / kMinCostPerShard));
// Each shard contains up to "block_size" units. [0, total) is sharded
// into:
// [0, block_size), [block_size, 2*block_size), ...
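The shard-count change keeps the `total * cost_per_unit` product and the min in 64-bit arithmetic before narrowing, so huge workloads no longer collapse onto a single shard. Python integers do not overflow, so the sketch below only simulates the old 32-bit narrowing with NumPy; the names and the simulation itself are illustrative, not the C++ code.

```python
import numpy as np

K_MIN_COST_PER_SHARD = 10000  # cost units (~1ns each), so ~10us per shard

def num_shards_fixed(num_workers, total, cost_per_unit):
    # Fixed formula: take the min in 64-bit, clamp to int only at the end.
    return max(1, min(num_workers, total * cost_per_unit // K_MIN_COST_PER_SHARD))

def num_shards_overflowing(num_workers, total, cost_per_unit):
    # Simulates the old behaviour where the 64-bit quotient was first
    # narrowed to a 32-bit int before taking the min.
    quotient = np.int64(total * cost_per_unit // K_MIN_COST_PER_SHARD)
    narrowed = int(quotient.astype(np.int32))  # silently wraps on overflow
    return max(1, min(num_workers, narrowed))

total_elements = 1 << 32
print(num_shards_fixed(3, total_elements, 10000))        # 3: one shard per worker
print(num_shards_overflowing(3, total_elements, 10000))  # 1: work collapses onto one shard
```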

View File

@ -59,6 +59,25 @@ TEST(Shard, Basic) {
}
}
TEST(Shard, OverflowTest) {
thread::ThreadPool threads(Env::Default(), "test", 3);
mutex mu;
for (auto workers : {1, 2, 3}) {
const int64 total_elements = 1LL << 32;
const int64 cost_per_unit = 10000;
int num_shards = 0;
int64 num_elements = 0;
Shard(workers, &threads, total_elements, cost_per_unit,
[&mu, &num_shards, &num_elements](int64 start, int64 limit) {
mutex_lock l(mu);
++num_shards;
num_elements += limit - start;
});
EXPECT_EQ(num_shards, workers);
EXPECT_EQ(num_elements, total_elements);
}
}
void BM_Sharding(int iters, int arg) {
thread::ThreadPool threads(Env::Default(), "test", 16);
const int64 total = 1LL << 30;

View File

@ -157,3 +157,17 @@ void ReadFileToVector(AAssetManager* const asset_manager,
VLOG(0) << "Read " << str_vector->size() << " values from " << filename;
}
void WriteProtoToFile(const char* const filename,
const google::protobuf::MessageLite& message) {
std::fstream outfile;
outfile.open(filename, std::fstream::binary | std::fstream::out);
if (outfile.fail()) {
LOG(WARNING) << "Failed to write proto to " << filename;
return;
} else {
google::protobuf::io::OstreamOutputStream raw_out(&outfile);
google::protobuf::io::CodedOutputStream coded_out(&raw_out);
message.SerializeToCodedStream(&coded_out);
}
VLOG(0) << "Wrote proto to " << filename;
}

View File

@ -42,4 +42,7 @@ void ReadFileToString(AAssetManager* const asset_manager,
void ReadFileToVector(AAssetManager* const asset_manager,
const char* const filename, std::vector<std::string>* str_vector);
void WriteProtoToFile(const char* const filename,
const google::protobuf::MessageLite& message);
#endif // ORG_TENSORFLOW_JNI_JNI_UTILS_H_

View File

@ -21,13 +21,16 @@ limitations under the License.
#include <jni.h>
#include <pthread.h>
#include <sys/stat.h>
#include <unistd.h>
#include <queue>
#include <sstream>
#include <string>
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mutex.h"
@ -51,6 +54,12 @@ static int g_image_mean; // The image mean.
static int g_num_runs = 0;
static int64 g_timing_total_us = 0;
#ifdef SAVE_STEP_STATS
static const bool kSaveStepStats = true;
#else
static const bool kSaveStepStats = false;
#endif
inline static int64 CurrentThreadTimeUs() {
struct timeval tv;
gettimeofday(&tv, NULL);
@ -199,11 +208,30 @@ static std::string ClassifyImage(const RGBA* const bitmap_src,
std::vector<tensorflow::Tensor> output_tensors;
std::vector<std::string> output_names({"output:0"});
const int64 start_time = CurrentThreadTimeUs();
tensorflow::Status s =
session->Run(input_tensors, output_names, {}, &output_tensors);
const int64 end_time = CurrentThreadTimeUs();
tensorflow::Status s;
int64 start_time, end_time;
if (kSaveStepStats) {
RunOptions run_options;
run_options.set_trace_level(RunOptions::FULL_TRACE);
RunOutputs run_outputs;
start_time = CurrentThreadTimeUs();
s = session->Run(run_options, input_tensors, output_names, {},
&output_tensors, &run_outputs);
end_time = CurrentThreadTimeUs();
assert(run_outputs.has_step_stats());
const StepStats& stats = run_outputs.step_stats();
mkdir("/sdcard/tf/", 0755);
const string filename =
strings::Printf("/sdcard/tf/stepstats%05d.pb", g_num_runs);
WriteProtoToFile(filename.c_str(), stats);
} else {
start_time = CurrentThreadTimeUs();
s = session->Run(input_tensors, output_names, {}, &output_tensors);
end_time = CurrentThreadTimeUs();
}
const int64 elapsed_time_inf = end_time - start_time;
g_timing_total_us += elapsed_time_inf;
VLOG(0) << "End computing. Ran in " << elapsed_time_inf / 1000 << "ms ("

View File

@ -40,6 +40,7 @@ py_library(
name = "platform",
srcs = glob(["platform/**/*.py"]),
srcs_version = "PY2AND3",
deps = ["//tensorflow/core:protos_all_py"],
)
py_library(
@ -1006,6 +1007,7 @@ py_test(
name = "session_test",
srcs = ["client/session_test.py"],
srcs_version = "PY2AND3",
tags = ["noasan"],
deps = [
":framework",
":framework_test_lib",
@ -1034,12 +1036,12 @@ cpu_only_kernel_test_list = glob([
"kernel_tests/attention_ops_test.py",
"kernel_tests/barrier_ops_test.py",
"kernel_tests/bcast_ops_test.py",
"kernel_tests/benchmark_test.py",
"kernel_tests/candidate_sampler_ops_test.py",
"kernel_tests/cholesky_op_test.py",
"kernel_tests/clip_ops_test.py",
"kernel_tests/decode_csv_op_test.py",
"kernel_tests/decode_raw_op_test.py",
"kernel_tests/depthtospace_op_test.py",
"kernel_tests/determinant_op_test.py",
"kernel_tests/diag_op_test.py",
"kernel_tests/edit_distance_op_test.py",
@ -1069,7 +1071,6 @@ cpu_only_kernel_test_list = glob([
"kernel_tests/sparse_reorder_op_test.py",
"kernel_tests/sparse_to_dense_op_test.py",
"kernel_tests/sparsemask_op_test.py",
"kernel_tests/spacetodepth_op_test.py",
"kernel_tests/summary_ops_test.py",
"kernel_tests/template_test.py",
"kernel_tests/topk_op_test.py",

View File

@ -59,7 +59,7 @@ from tensorflow.core.framework.attr_value_pb2 import *
from tensorflow.core.protobuf.config_pb2 import *
from tensorflow.core.util.event_pb2 import *
# Import things out of contrib
from tensorflow import contrib
import tensorflow.contrib as contrib
# Framework
from tensorflow.python.framework.framework_lib import *
@ -101,6 +101,7 @@ from tensorflow.python.framework import framework_lib
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import constant_op
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import histogram_ops
from tensorflow.python.ops import io_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import script_ops
@ -117,8 +118,8 @@ _whitelist = set([app, compat, contrib, errors, flags, gfile, image,
# strings of other modules.
__all__ = make_all(__name__,
[framework_lib, array_ops, client_lib, constant_op,
control_flow_ops, io_ops, math_ops, nn, script_ops,
sparse_ops, state_ops, train])
control_flow_ops, histogram_ops, io_ops, math_ops, nn,
script_ops, sparse_ops, state_ops, train])
# Symbols whitelisted for export without documentation.
# TODO(cwhipkey): review these and move to contrib, expose through

View File

@ -294,7 +294,7 @@ class BaseSession(SessionInterface):
[`SparseTensorValue`](../../api_docs/python/sparse_ops.md#SparseTensorValue).
The optional `options` argument expects a [`RunOptions`] proto. The options
allow controling the behavior of this particular step (e.g. turning tracing
allow controlling the behavior of this particular step (e.g. turning tracing
on).
The optional `run_outputs` argument expects a [`RunOutputs`] proto. When

View File

@ -25,7 +25,6 @@ import numpy as np
import six
from six.moves import xrange # pylint: disable=redefined-builtin
from tensorflow.core.framework import step_stats_pb2
from tensorflow.core.lib.core import error_codes_pb2
from tensorflow.core.protobuf import config_pb2
from tensorflow.python.client import session
@ -927,13 +926,32 @@ class SessionTest(test_util.TensorFlowTestCase):
sess.run(constant_op.constant(1.0),
options=run_options,
run_outputs=run_outputs)
self.assertTrue(run_outputs.HasField('step_stats'))
self.assertEquals(len(run_outputs.step_stats.dev_stats), 1)
step_stats = step_stats_pb2.StepStats()
self.assertEquals(len(step_stats.dev_stats), 0)
def testRunOptionsRunOutputs(self):
run_options = config_pb2.RunOptions(
trace_level=config_pb2.RunOptions.FULL_TRACE)
run_outputs = config_pb2.RunOutputs()
step_stats.CopyFrom(run_outputs.step_stats)
self.assertEquals(len(step_stats.dev_stats), 1)
with ops.device('/cpu:0'):
with session.Session() as sess:
# all combinations are valid
sess.run(constant_op.constant(1.0), options=None, run_outputs=None)
sess.run(constant_op.constant(1.0), options=None,
run_outputs=run_outputs)
self.assertTrue(not run_outputs.HasField('step_stats'))
sess.run(constant_op.constant(1.0), options=run_options,
run_outputs=None)
self.assertTrue(not run_outputs.HasField('step_stats'))
sess.run(constant_op.constant(1.0), options=run_options,
run_outputs=run_outputs)
self.assertTrue(run_outputs.HasField('step_stats'))
self.assertEquals(len(run_outputs.step_stats.dev_stats), 1)
def testFeedShapeCompatibility(self):
with session.Session() as sess:

View File

@ -81,6 +81,7 @@ def all_libraries(module_to_name, members, documented):
exclude_symbols=["sparse_matmul", "arg_min", "arg_max",
"lin_space", "sparse_segment_mean_grad"],
prefix=PREFIX_TEXT),
library("histogram_ops", "Histograms"),
library("control_flow_ops", "Control Flow", prefix=PREFIX_TEXT),
library("image", "Images", tf.image, exclude_symbols=["ResizeMethod"],
prefix=PREFIX_TEXT),

View File

@ -165,9 +165,8 @@ class TensorFlowTestCase(googletest.TestCase):
text_format.Merge(expected_message_maybe_ascii, expected_message)
self._AssertProtoEquals(expected_message, message)
else:
assert False, ("Can't compare protos of type " +
type(expected_message_maybe_ascii) + " and " +
type(message))
assert False, ("Can't compare protos of type %s and %s" %
(type(expected_message_maybe_ascii), type(message)))
def assertProtoEqualsVersion(
self, expected, actual, producer=versions.GRAPH_DEF_VERSION,

View File

@ -0,0 +1,158 @@
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for tensorflow.python.framework.importer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import random
import tensorflow as tf
from google.protobuf import text_format
from tensorflow.core.util import test_log_pb2
from tensorflow.python.platform import benchmark
# Used by SomeRandomBenchmark class below.
_ran_somebenchmark_1 = [False]
_ran_somebenchmark_2 = [False]
_ran_somebenchmark_but_shouldnt = [False]
class SomeRandomBenchmark(tf.test.Benchmark):
"""This Benchmark should automatically be registered in the registry."""
def _dontRunThisBenchmark(self):
_ran_somebenchmark_but_shouldnt[0] = True
def notBenchmarkMethod(self):
_ran_somebenchmark_but_shouldnt[0] = True
def benchmark1(self):
_ran_somebenchmark_1[0] = True
def benchmark2(self):
_ran_somebenchmark_2[0] = True
class TestReportingBenchmark(tf.test.Benchmark):
"""This benchmark (maybe) reports some stuff."""
def benchmarkReport1(self):
self.report_benchmark(iters=1)
def benchmarkReport2(self):
self.report_benchmark(
iters=2, name="custom_benchmark_name",
extras={"number_key": 3, "other_key": "string"})
class BenchmarkTest(tf.test.TestCase):
def testGlobalBenchmarkRegistry(self):
registry = list(benchmark.GLOBAL_BENCHMARK_REGISTRY)
self.assertEqual(len(registry), 2)
self.assertTrue(SomeRandomBenchmark in registry)
self.assertTrue(TestReportingBenchmark in registry)
def testRunSomeRandomBenchmark(self):
# Validate that SomeBenchmark has not run yet
self.assertFalse(_ran_somebenchmark_1[0])
self.assertFalse(_ran_somebenchmark_2[0])
self.assertFalse(_ran_somebenchmark_but_shouldnt[0])
# Run other benchmarks, but this won't run the one we care about
benchmark._run_benchmarks("unrelated")
# Validate that SomeBenchmark has not run yet
self.assertFalse(_ran_somebenchmark_1[0])
self.assertFalse(_ran_somebenchmark_2[0])
self.assertFalse(_ran_somebenchmark_but_shouldnt[0])
# Run all the benchmarks, avoid generating any reports
if benchmark.TEST_REPORTER_TEST_ENV in os.environ:
del os.environ[benchmark.TEST_REPORTER_TEST_ENV]
benchmark._run_benchmarks("SomeRandom")
# Validate that SomeRandomBenchmark ran correctly
self.assertTrue(_ran_somebenchmark_1[0])
self.assertTrue(_ran_somebenchmark_2[0])
self.assertFalse(_ran_somebenchmark_but_shouldnt[0])
def testReportingBenchmark(self):
tempdir = tf.test.get_temp_dir()
try:
tf.gfile.MakeDirs(tempdir)
except OSError as e:
# It's OK if the directory already exists.
if " exists:" not in str(e):
raise e
prefix = os.path.join(
tempdir, "reporting_bench_%016x_" % random.getrandbits(64))
expected_output_file = "%s%s" % (
prefix, "TestReportingBenchmark.benchmarkReport1")
expected_output_file_2 = "%s%s" % (
prefix, "TestReportingBenchmark.custom_benchmark_name")
try:
self.assertFalse(tf.gfile.Exists(expected_output_file))
# Run benchmark but without env, shouldn't write anything
if benchmark.TEST_REPORTER_TEST_ENV in os.environ:
del os.environ[benchmark.TEST_REPORTER_TEST_ENV]
reporting = TestReportingBenchmark()
reporting.benchmarkReport1() # This should run without writing anything
self.assertFalse(tf.gfile.Exists(expected_output_file))
# Run benchmark with env, should write
os.environ[benchmark.TEST_REPORTER_TEST_ENV] = prefix
reporting = TestReportingBenchmark()
reporting.benchmarkReport1() # This should write
reporting.benchmarkReport2() # This should write
# Check the files were written
self.assertTrue(tf.gfile.Exists(expected_output_file))
self.assertTrue(tf.gfile.Exists(expected_output_file_2))
# Check the contents are correct
expected_1 = test_log_pb2.BenchmarkEntry()
expected_1.name = "TestReportingBenchmark.benchmarkReport1"
expected_1.iters = 1
expected_2 = test_log_pb2.BenchmarkEntry()
expected_2.name = "TestReportingBenchmark.custom_benchmark_name"
expected_2.iters = 2
expected_2.extras["number_key"].double_value = 3
expected_2.extras["other_key"].string_value = "string"
read_benchmark_1 = tf.gfile.GFile(expected_output_file, "r").read()
read_benchmark_1 = text_format.Merge(
read_benchmark_1, test_log_pb2.BenchmarkEntry())
self.assertProtoEquals(expected_1, read_benchmark_1)
read_benchmark_2 = tf.gfile.GFile(expected_output_file_2, "r").read()
read_benchmark_2 = text_format.Merge(
read_benchmark_2, test_log_pb2.BenchmarkEntry())
self.assertProtoEquals(expected_2, read_benchmark_2)
finally:
tf.gfile.DeleteRecursively(tempdir)
if __name__ == "__main__":
tf.test.main()

View File

@ -25,12 +25,17 @@ import tensorflow as tf
class DepthToSpaceTest(tf.test.TestCase):
def _testOne(self, inputs, block_size, outputs):
for use_gpu in [False, True]:
with self.test_session(use_gpu=use_gpu):
x_tf = tf.depth_to_space(tf.to_float(inputs), block_size)
self.assertAllEqual(x_tf.eval(), outputs)
def testBasic(self):
x_np = [[[[1, 2, 3, 4]]]]
with self.test_session(use_gpu=False):
block_size = 2
x_tf = tf.depth_to_space(x_np, block_size)
self.assertAllEqual(x_tf.eval(), [[[[1], [2]], [[3], [4]]]])
block_size = 2
x_out = [[[[1], [2]], [[3], [4]]]]
self._testOne(x_np, block_size, x_out)
# Tests for larger input dimensions. To make sure elements are
# correctly ordered spatially.
@ -40,12 +45,28 @@ class DepthToSpaceTest(tf.test.TestCase):
[[9, 10, 11, 12],
[13, 14, 15, 16]]]]
block_size = 2
with self.test_session(use_gpu=False):
x_tf = tf.depth_to_space(x_np, block_size)
self.assertAllEqual(x_tf.eval(), [[[[1], [2], [5], [6]],
[[3], [4], [7], [8]],
[[9], [10], [13], [14]],
[[11], [12], [15], [16]]]])
x_out = [[[[1], [2], [5], [6]],
[[3], [4], [7], [8]],
[[9], [10], [13], [14]],
[[11], [12], [15], [16]]]]
self._testOne(x_np, block_size, x_out)
def testBlockSize2Batch10(self):
block_size = 2
def batch_input_elt(i):
return [[[1 * i, 2 * i, 3 * i, 4 * i],
[5 * i, 6 * i, 7 * i, 8 * i]],
[[9 * i, 10 * i, 11 * i, 12 * i],
[13 * i, 14 * i, 15 * i, 16 * i]]]
def batch_output_elt(i):
return [[[1 * i], [2 * i], [5 * i], [6 * i]],
[[3 * i], [4 * i], [7 * i], [8 * i]],
[[9 * i], [10 * i], [13 * i], [14 * i]],
[[11 * i], [12 * i], [15 * i], [16 * i]]]
batch_size = 10
x_np = [batch_input_elt(i) for i in xrange(batch_size)]
x_out = [batch_output_elt(i) for i in xrange(batch_size)]
self._testOne(x_np, block_size, x_out)
# Tests for different width and height.
def testNonSquare(self):
@ -53,46 +74,42 @@ class DepthToSpaceTest(tf.test.TestCase):
[[5, 50, 6, 60, 7, 70, 8, 80]],
[[9, 90, 10, 100, 11, 110, 12, 120]]]]
block_size = 2
with self.test_session(use_gpu=False):
x_tf = tf.depth_to_space(x_np, block_size)
self.assertAllEqual(x_tf.eval(), [[[[1, 10], [2, 20]],
[[3, 30], [4, 40]],
[[5, 50], [6, 60]],
[[7, 70], [8, 80]],
[[9, 90], [10, 100]],
[[11, 110], [12, 120]]]])
x_out = [[[[1, 10], [2, 20]],
[[3, 30], [4, 40]],
[[5, 50], [6, 60]],
[[7, 70], [8, 80]],
[[9, 90], [10, 100]],
[[11, 110], [12, 120]]]]
self._testOne(x_np, block_size, x_out)
# Tests for larger input dimensions. To make sure elements are
# correctly ordered spatially.
def testBlockSize4FlatInput(self):
x_np = [[[[1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16]]]]
block_size = 4
with self.test_session(use_gpu=False):
x_tf = tf.depth_to_space(x_np, block_size)
self.assertAllEqual(x_tf.eval(), [[[[1], [2], [5], [6]],
[[3], [4], [7], [8]],
[[9], [10], [13], [14]],
[[11], [12], [15], [16]]]])
x_out = [[[[1], [2], [5], [6]],
[[3], [4], [7], [8]],
[[9], [10], [13], [14]],
[[11], [12], [15], [16]]]]
self._testOne(x_np, block_size, x_out)
# Tests for larger input depths.
# To make sure elements are properly interleaved in depth.
def testDepthInterleaved(self):
x_np = [[[[1, 10, 2, 20, 3, 30, 4, 40]]]]
block_size = 2
with self.test_session(use_gpu=False):
x_tf = tf.depth_to_space(x_np, block_size)
self.assertAllEqual(x_tf.eval(), [[[[1, 10], [2, 20]],
[[3, 30], [4, 40]]]])
x_out = [[[[1, 10], [2, 20]],
[[3, 30], [4, 40]]]]
self._testOne(x_np, block_size, x_out)
# Tests for larger input depths. Here an odd depth.
# To make sure elements are properly interleaved in depth.
def testDepthInterleavedDepth3(self):
x_np = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
block_size = 2
with self.test_session(use_gpu=False):
x_tf = tf.depth_to_space(x_np, block_size)
self.assertAllEqual(x_tf.eval(), [[[[1, 2, 3], [4, 5, 6]],
[[7, 8, 9], [10, 11, 12]]]])
x_out = [[[[1, 2, 3], [4, 5, 6]],
[[7, 8, 9], [10, 11, 12]]]]
self._testOne(x_np, block_size, x_out)
# Tests for larger input depths.
# To make sure elements are properly interleaved in depth.
@ -102,13 +119,11 @@ class DepthToSpaceTest(tf.test.TestCase):
[[9, 90, 10, 100, 11, 110, 12, 120],
[13, 130, 14, 140, 15, 150, 16, 160]]]]
block_size = 2
with self.test_session(use_gpu=False):
x_tf = tf.depth_to_space(x_np, block_size)
self.assertAllEqual(x_tf.eval(),
[[[[1, 10], [2, 20], [5, 50], [6, 60]],
[[3, 30], [4, 40], [7, 70], [8, 80]],
[[9, 90], [10, 100], [13, 130], [14, 140]],
[[11, 110], [12, 120], [15, 150], [16, 160]]]])
x_out = [[[[1, 10], [2, 20], [5, 50], [6, 60]],
[[3, 30], [4, 40], [7, 70], [8, 80]],
[[9, 90], [10, 100], [13, 130], [14, 140]],
[[11, 110], [12, 120], [15, 150], [16, 160]]]]
self._testOne(x_np, block_size, x_out)
# Error handling:
@ -205,5 +220,6 @@ class DepthToSpaceGradientTest(tf.test.TestCase):
block_size = 3
self._compare(1, 2, 3, 2, block_size)
if __name__ == "__main__":
tf.test.main()

View File

@ -184,7 +184,8 @@ class RNNCellTest(tf.test.TestCase):
x = tf.zeros([1, 1], dtype=tf.int32)
m = tf.zeros([1, 2])
g, new_m = tf.nn.rnn_cell.EmbeddingWrapper(
tf.nn.rnn_cell.GRUCell(2), 3)(x, m)
tf.nn.rnn_cell.GRUCell(2),
embedding_classes=3, embedding_size=2)(x, m)
sess.run([tf.initialize_all_variables()])
res = sess.run([g, new_m], {x.name: np.array([[1]]),
m.name: np.array([[0.1, 0.1]])})

View File

@ -19,7 +19,6 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import time
import timeit
@ -953,6 +952,7 @@ def graph_creation_static_vs_dynamic_rnn_benchmark(max_time):
print("%d \t %f \t %f \t %f" %
(max_time, delta_static, delta_dynamic, delta_dynamic/delta_static))
return delta_static, delta_dynamic
def _timer(sess, ops):
@ -1013,6 +1013,8 @@ def static_vs_dynamic_rnn_benchmark(batch_size, max_time, num_units, use_gpu):
(batch_size, max_time, num_units, use_gpu, delta_static,
delta_dynamic, delta_dynamic/delta_static))
return delta_static, delta_dynamic
def _dynamic_rnn_swap_memory_benchmark(inputs_t, sequence_length,
swap_memory):
@ -1061,6 +1063,7 @@ def dynamic_rnn_swap_memory_benchmark(batch_size, max_time, num_units):
print("%d \t %d \t %d \t %f \t %f \t %f" %
(batch_size, max_time, num_units, no_swap, swap, swap/no_swap))
return no_swap, swap
def rnn_long_sequence_benchmark(batch_size, seqlen, num_units,
@ -1097,34 +1100,55 @@ def rnn_long_sequence_benchmark(batch_size, seqlen, num_units,
elapsed/seqlen))
def main(_):
print("Graph Creation: Static Unroll vs. Dynamic Unroll LSTM")
print("max_t \t dt(static) \t dt(dynamic) \t dt(dynamic)/dt(static)")
for max_time in (1, 25, 50):
graph_creation_static_vs_dynamic_rnn_benchmark(max_time)
class BenchmarkRNN(tf.test.Benchmark):
print("Calculation: Static Unroll with Dynamic Flow LSTM "
"vs. Dynamic Unroll LSTM")
print("batch \t max_t \t units \t gpu \t dt(static) \t dt(dynamic) "
"\t dt(dynamic)/dt(static)")
for batch_size in (256,):
for max_time in (50,):
for num_units in (512, 256, 128):
for use_gpu in (False, True):
static_vs_dynamic_rnn_benchmark(
batch_size, max_time, num_units, use_gpu)
def benchmarkGraphCreationStaticVsDynamicLSTM(self):
print("Graph Creation: Static Unroll vs. Dynamic Unroll LSTM")
print("max_t \t dt(static) \t dt(dynamic) \t dt(dynamic)/dt(static)")
for max_time in (1, 25, 50):
s_dt, d_dt = graph_creation_static_vs_dynamic_rnn_benchmark(max_time)
self.report_benchmark(name="graph_creation_time_static_T%02d" % max_time,
iters=5, wall_time=s_dt)
self.report_benchmark(name="graph_creation_time_dynamic_T%02d" % max_time,
iters=5, wall_time=d_dt)
print("Calculation: Dynamic LSTM No Memory Swap vs. Memory Swap")
print("batch \t max_t \t units \t no_swap \t swap \t swap/no_swap")
for batch_size in (256, 512):
for max_time in (100,):
for num_units in (512, 256, 128):
dynamic_rnn_swap_memory_benchmark(batch_size, max_time, num_units)
def benchmarkStaticUnrollVsDynamicFlowLSTM(self):
print("Calculation: Static Unroll with Dynamic Flow LSTM "
"vs. Dynamic Unroll LSTM")
print("batch \t max_t \t units \t gpu \t dt(static) \t dt(dynamic) "
"\t dt(dynamic)/dt(static)")
for batch_size in (256,):
for max_time in (50,):
for num_units in (512, 256, 128):
for use_gpu in (False, True):
s_dt, d_dt = static_vs_dynamic_rnn_benchmark(
batch_size, max_time, num_units, use_gpu)
self.report_benchmark(
name="static_unroll_time_T%02d_B%03d_N%03d_gpu_%s"
% (max_time, batch_size, num_units, use_gpu),
iters=10, wall_time=s_dt)
self.report_benchmark(
name="dynamic_unroll_time_T%02d_B%03d_N%03d_gpu_%s"
% (max_time, batch_size, num_units, use_gpu),
iters=10, wall_time=d_dt)
def benchmarkDynamicLSTMNoMemorySwapVsMemorySwap(self):
print("Calculation: Dynamic LSTM No Memory Swap vs. Memory Swap")
print("batch \t max_t \t units \t no_swap \t swap \t swap/no_swap")
for batch_size in (256, 512):
for max_time in (100,):
for num_units in (512, 256, 128):
no_swap, swap = dynamic_rnn_swap_memory_benchmark(
batch_size, max_time, num_units)
self.report_benchmark(
name="dynamic_lstm_no_memory_swap_T%02d_B%03d_N%03d"
% (max_time, batch_size, num_units),
iters=10, wall_time=no_swap)
self.report_benchmark(
name="dynamic_lstm_with_memory_swap_T%02d_B%03d_N%03d"
% (max_time, batch_size, num_units),
iters=10, wall_time=swap)
if __name__ == "__main__":
if "--benchmarks" in sys.argv:
sys.argv.remove("--benchmarks")
tf.app.run()
else:
tf.test.main()
tf.test.main()

View File

@ -121,6 +121,13 @@ class SoftmaxTest(tf.test.TestCase):
self._testOverflow(use_gpu=False)
def testEmpty(self):
with self.test_session():
x = tf.constant([[]], shape=[0, 3])
self.assertEqual(0, tf.size(x).eval())
expected_y = np.array([]).reshape(0, 3)
np.testing.assert_array_equal(expected_y, tf.nn.softmax(x).eval())
if __name__ == "__main__":
tf.test.main()

View File

@ -25,13 +25,18 @@ import tensorflow as tf
class SpaceToDepthTest(tf.test.TestCase):
def _testOne(self, inputs, block_size, outputs):
for use_gpu in [False, True]:
with self.test_session(use_gpu=use_gpu):
x_tf = tf.space_to_depth(tf.to_float(inputs), block_size)
self.assertAllEqual(x_tf.eval(), outputs)
def testBasic(self):
x_np = [[[[1], [2]],
[[3], [4]]]]
with self.test_session(use_gpu=False):
block_size = 2
out_tf = tf.space_to_depth(x_np, block_size)
self.assertAllEqual(out_tf.eval(), [[[[1, 2, 3, 4]]]])
block_size = 2
x_out = [[[[1, 2, 3, 4]]]]
self._testOne(x_np, block_size, x_out)
# Tests for larger input dimensions. To make sure elements are
# correctly ordered spatially.
@ -40,14 +45,12 @@ class SpaceToDepthTest(tf.test.TestCase):
[[3], [4], [7], [8]],
[[9], [10], [13], [14]],
[[11], [12], [15], [16]]]]
with self.test_session(use_gpu=False):
block_size = 2
out_tf = tf.space_to_depth(x_np, block_size)
self.assertAllEqual(out_tf.eval(), [[[[1, 2, 3, 4],
[5, 6, 7, 8]],
[[9, 10, 11, 12],
[13, 14, 15, 16]]]])
block_size = 2
x_out = [[[[1, 2, 3, 4],
[5, 6, 7, 8]],
[[9, 10, 11, 12],
[13, 14, 15, 16]]]]
self._testOne(x_np, block_size, x_out)
# Tests for larger input dimensions. To make sure elements are
# correctly ordered in depth. Here, larger block size.
@ -56,34 +59,27 @@ class SpaceToDepthTest(tf.test.TestCase):
[[3], [4], [7], [8]],
[[9], [10], [13], [14]],
[[11], [12], [15], [16]]]]
with self.test_session(use_gpu=False):
block_size = 4
out_tf = tf.space_to_depth(x_np, block_size)
self.assertAllEqual(
out_tf.eval(),
[[[[1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16]]]])
block_size = 4
x_out = [[[[1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16]]]]
self._testOne(x_np, block_size, x_out)
# Tests for larger input depths.
# To make sure elements are properly interleaved in depth.
def testDepthInterleaved(self):
x_np = [[[[1, 10], [2, 20]],
[[3, 30], [4, 40]]]]
with self.test_session(use_gpu=False):
block_size = 2
out_tf = tf.space_to_depth(x_np, block_size)
self.assertAllEqual(out_tf.eval(), [[[[1, 10, 2, 20, 3, 30, 4, 40]]]])
block_size = 2
x_out = [[[[1, 10, 2, 20, 3, 30, 4, 40]]]]
self._testOne(x_np, block_size, x_out)
# Tests for larger input depths. Here an odd depth.
# To make sure elements are properly interleaved in depth.
def testDepthInterleavedDepth3(self):
x_np = [[[[1, 2, 3], [4, 5, 6]],
[[7, 8, 9], [10, 11, 12]]]]
with self.test_session(use_gpu=False):
block_size = 2
out_tf = tf.space_to_depth(x_np, block_size)
self.assertAllEqual(out_tf.eval(),
[[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]])
block_size = 2
x_out = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
self._testOne(x_np, block_size, x_out)
# Tests for larger input dimensions AND for larger input depths.
# To make sure elements are properly interleaved in depth and ordered
@ -93,14 +89,29 @@ class SpaceToDepthTest(tf.test.TestCase):
[[3, 30], [4, 40], [7, 70], [8, 80]],
[[9, 90], [10, 100], [13, 130], [14, 140]],
[[11, 110], [12, 120], [15, 150], [16, 160]]]]
with self.test_session(use_gpu=False):
block_size = 2
out_tf = tf.space_to_depth(x_np, block_size)
self.assertAllEqual(out_tf.eval(),
[[[[1, 10, 2, 20, 3, 30, 4, 40],
[5, 50, 6, 60, 7, 70, 8, 80]],
[[9, 90, 10, 100, 11, 110, 12, 120],
[13, 130, 14, 140, 15, 150, 16, 160]]]])
block_size = 2
x_out = [[[[1, 10, 2, 20, 3, 30, 4, 40],
[5, 50, 6, 60, 7, 70, 8, 80]],
[[9, 90, 10, 100, 11, 110, 12, 120],
[13, 130, 14, 140, 15, 150, 16, 160]]]]
self._testOne(x_np, block_size, x_out)
def testBlockSize2Batch10(self):
block_size = 2
def batch_input_elt(i):
return [[[1 * i], [2 * i], [5 * i], [6 * i]],
[[3 * i], [4 * i], [7 * i], [8 * i]],
[[9 * i], [10 * i], [13 * i], [14 * i]],
[[11 * i], [12 * i], [15 * i], [16 * i]]]
def batch_output_elt(i):
return [[[1 * i, 2 * i, 3 * i, 4 * i],
[5 * i, 6 * i, 7 * i, 8 * i]],
[[9 * i, 10 * i, 11 * i, 12 * i],
[13 * i, 14 * i, 15 * i, 16 * i]]]
batch_size = 10
x_np = [batch_input_elt(i) for i in xrange(batch_size)]
x_out = [batch_output_elt(i) for i in xrange(batch_size)]
self._testOne(x_np, block_size, x_out)
# Tests for different width and height.
def testNonSquare(self):
@ -110,13 +121,11 @@ class SpaceToDepthTest(tf.test.TestCase):
[[7, 70], [8, 80]],
[[9, 90], [10, 100]],
[[11, 110], [12, 120]]]]
with self.test_session(use_gpu=False):
block_size = 2
out_tf = tf.space_to_depth(x_np, block_size)
self.assertAllEqual(out_tf.eval(),
[[[[1, 10, 2, 20, 3, 30, 4, 40]],
[[5, 50, 6, 60, 7, 70, 8, 80]],
[[9, 90, 10, 100, 11, 110, 12, 120]]]])
block_size = 2
x_out = [[[[1, 10, 2, 20, 3, 30, 4, 40]],
[[5, 50, 6, 60, 7, 70, 8, 80]],
[[9, 90, 10, 100, 11, 110, 12, 120]]]]
self._testOne(x_np, block_size, x_out)
# Error handling:

View File

@ -405,6 +405,7 @@ def boolean_mask(tensor, mask, name="boolean_mask"):
ValueError: If shapes do not conform.
Examples:
```python
# 2-D example
a = [[1, 2], [3, 4], [5, 6]]

View File

@ -218,7 +218,7 @@ class QueueBase(object):
return gen_data_flow_ops._queue_enqueue(self._queue_ref, vals, name=scope)
def enqueue_many(self, vals, name=None):
"""Enqueues zero or elements to this queue.
"""Enqueues zero or more elements to this queue.
This operation slices each component tensor along the 0th dimension to
make multiple queue elements. All of the tensors in `vals` must have the

View File

@ -12,7 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Operations for histograms."""
# pylint: disable=g-short-docstring-punctuation
"""## Histograms
@@histogram_fixed_width
"""
from __future__ import absolute_import
from __future__ import division
@ -24,30 +28,34 @@ from tensorflow.python.ops import array_ops
from tensorflow.python.ops import clip_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.ops import variable_scope
def histogram_fixed_width(hist,
new_values,
def histogram_fixed_width(values,
value_range,
use_locking=False,
name='histogram_fixed_width'):
"""Update histogram Variable with new values.
nbins=100,
use_locking=True,
dtype=dtypes.int32,
name=None):
"""Return histogram of values.
This Op fills histogram with counts of values falling within fixed-width,
half-open bins.
Given the tensor `values`, this operation returns a rank 1 histogram counting
the number of entries in `values` that fell into every bin. The bins are
equal width and determined by the arguments `value_range` and `nbins`.
Args:
hist: 1-D mutable `Tensor`, e.g. a `Variable`.
new_values: Numeric `Tensor`.
values: Numeric `Tensor`.
value_range: Shape [2] `Tensor`. new_values <= value_range[0] will be
mapped to hist[0], values >= value_range[1] will be mapped to hist[-1].
Must be same dtype as new_values.
nbins: Integer number of bins in this histogram.
use_locking: Boolean.
If `True`, use locking during the operation (optional).
name: A name for this operation (optional).
dtype: dtype for returned histogram.
name: A name for this operation (defaults to 'histogram_fixed_width').
Returns:
An op that updates `hist` with `new_values` when evaluated.
A `Variable` holding histogram of values.
Examples:
```python
@ -57,24 +65,21 @@ def histogram_fixed_width(hist,
new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
with tf.default_session() as sess:
hist = variables.Variable(array_ops.zeros(nbins, dtype=tf.int32))
hist_update = histogram_ops.histogram_fixed_width(hist, new_values,
value_range)
hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
variables.initialize_all_variables().run()
sess.run(hist_update) => [2, 1, 1, 0, 2]
sess.run(hist) => [2, 1, 1, 0, 2]
```
"""
with ops.op_scope([hist, new_values, value_range], name) as scope:
new_values = ops.convert_to_tensor(new_values, name='new_values')
new_values = array_ops.reshape(new_values, [-1])
with variable_scope.variable_op_scope(
[values, value_range], name, 'histogram_fixed_width') as scope:
values = ops.convert_to_tensor(values, name='values')
values = array_ops.reshape(values, [-1])
value_range = ops.convert_to_tensor(value_range, name='value_range')
dtype = hist.dtype
# Map tensor values that fall within value_range to [0, 1].
scaled_values = math_ops.truediv(new_values - value_range[0],
scaled_values = math_ops.truediv(values - value_range[0],
value_range[1] - value_range[0],
name='scaled_values')
nbins = math_ops.cast(hist.get_shape()[0], scaled_values.dtype)
# map tensor values within the open interval value_range to {0,.., nbins-1},
# values outside the open interval will be zero or less, or nbins or more.
@ -87,9 +92,18 @@ def histogram_fixed_width(hist,
# Dummy vector to scatter.
# TODO(langmore) Replace non-ideal creation of large dummy vector once an
# alternative to scatter is available.
updates = array_ops.ones([indices.get_shape()[0]], dtype=dtype)
return state_ops.scatter_add(hist,
indices,
updates,
use_locking=use_locking,
name=scope)
updates = array_ops.ones_like(indices, dtype=dtype)
hist = variable_scope.get_variable('hist',
initializer=array_ops.zeros_initializer(
[nbins],
dtype=dtype),
trainable=False)
hist_assign_zero = hist.assign(array_ops.zeros_like(hist))
with ops.control_dependencies([hist_assign_zero]):
return state_ops.scatter_add(hist,
indices,
updates,
use_locking=use_locking,
name=scope.name)
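A short usage sketch of the new signature, reusing the values from the docstring example above; note the op now creates (and resets) its own histogram variable instead of updating one passed in:

```python
import tensorflow as tf

value_range = [0.0, 5.0]
new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]

with tf.Session() as sess:
  # Bins: (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
  hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
  sess.run(tf.initialize_all_variables())
  print(sess.run(hist))  # [2 1 1 0 2]
```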

View File

@ -17,149 +17,132 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import test_util
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import histogram_ops
from tensorflow.python.ops import variables
from tensorflow.python.platform import googletest
import numpy as np
import tensorflow as tf
class HistogramFixedWidthTest(test_util.TensorFlowTestCase):
class HistogramFixedWidthTest(tf.test.TestCase):
def setUp(self):
self.rng = np.random.RandomState(0)
def test_empty_input_gives_all_zero_counts(self):
# Bins will be:
# (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
value_range = [0.0, 5.0]
values = []
expected_bin_counts = [0, 0, 0, 0, 0]
with self.test_session():
hist = tf.histogram_fixed_width(values, value_range, nbins=5)
tf.initialize_all_variables().run()
# Hist should start "fresh" with every eval.
self.assertAllClose(expected_bin_counts, hist.eval())
self.assertAllClose(expected_bin_counts, hist.eval())
def test_one_update_on_constant_input(self):
# Bins will be:
# (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
nbins = [5]
value_range = [0.0, 5.0]
new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
expected_bin_counts = [2, 1, 1, 0, 2]
with self.test_session() as sess:
hist = variables.Variable(array_ops.zeros(nbins, dtype=dtypes.int32))
hist_update = histogram_ops.histogram_fixed_width(hist, new_values,
value_range)
variables.initialize_all_variables().run()
self.assertTrue(hist.dtype.is_compatible_with(hist_update.dtype))
updated_hist_array = sess.run(hist_update)
with self.test_session():
hist = tf.histogram_fixed_width(values, value_range, nbins=5)
tf.initialize_all_variables().run()
# The new updated_hist_array is returned by the updating op.
self.assertAllClose(expected_bin_counts, updated_hist_array)
# hist should contain updated values, but eval() should not change it.
# Hist should start "fresh" with every eval.
self.assertAllClose(expected_bin_counts, hist.eval())
self.assertAllClose(expected_bin_counts, hist.eval())
def test_one_update_on_constant_2d_input(self):
# Bins will be:
# (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
nbins = [5]
value_range = [0.0, 5.0]
new_values = [[-1.0, 0.0, 1.5], [2.0, 5.0, 15]]
values = [[-1.0, 0.0, 1.5], [2.0, 5.0, 15]]
expected_bin_counts = [2, 1, 1, 0, 2]
with self.test_session() as sess:
hist = variables.Variable(array_ops.zeros(nbins, dtype=dtypes.int32))
hist_update = histogram_ops.histogram_fixed_width(hist, new_values,
value_range)
variables.initialize_all_variables().run()
self.assertTrue(hist.dtype.is_compatible_with(hist_update.dtype))
updated_hist_array = sess.run(hist_update)
with self.test_session():
hist = tf.histogram_fixed_width(values, value_range, nbins=5)
tf.initialize_all_variables().run()
# The new updated_hist_array is returned by the updating op.
self.assertAllClose(expected_bin_counts, updated_hist_array)
# hist should contain updated values, but eval() should not change it.
# Hist should start "fresh" with every eval.
self.assertAllClose(expected_bin_counts, hist.eval())
self.assertAllClose(expected_bin_counts, hist.eval())
def test_two_updates_on_constant_input(self):
# Bins will be:
# (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
nbins = [5]
value_range = [0.0, 5.0]
new_values_1 = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
new_values_2 = [1.5, 4.5, 4.5, 4.5, 0.0, 0.0]
values_1 = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
values_2 = [1.5, 4.5, 4.5, 4.5, 0.0, 0.0]
expected_bin_counts_1 = [2, 1, 1, 0, 2]
expected_bin_counts_2 = [4, 2, 1, 0, 5]
with self.test_session() as sess:
hist = variables.Variable(array_ops.zeros(nbins, dtype=dtypes.int32))
new_values = array_ops.placeholder(dtypes.float32, shape=[6])
hist_update = histogram_ops.histogram_fixed_width(hist, new_values,
value_range)
variables.initialize_all_variables().run()
updated_hist_array = sess.run(hist_update,
feed_dict={new_values: new_values_1})
expected_bin_counts_2 = [2, 1, 0, 0, 3]
with self.test_session():
values = tf.placeholder(tf.float32, shape=[6])
hist = tf.histogram_fixed_width(values, value_range, nbins=5)
tf.initialize_all_variables().run()
# The new updated_hist_array is returned by the updating op.
# hist should contain the updated values.
self.assertAllClose(expected_bin_counts_1, updated_hist_array)
self.assertAllClose(expected_bin_counts_1, hist.eval())
updated_hist_array = sess.run(hist_update,
feed_dict={new_values: new_values_2})
self.assertAllClose(expected_bin_counts_2, updated_hist_array)
self.assertAllClose(expected_bin_counts_2, hist.eval())
# The values in hist should depend on the current feed and nothing else.
self.assertAllClose(expected_bin_counts_1,
hist.eval(feed_dict={values: values_1}))
self.assertAllClose(expected_bin_counts_2,
hist.eval(feed_dict={values: values_2}))
self.assertAllClose(expected_bin_counts_1,
hist.eval(feed_dict={values: values_1}))
self.assertAllClose(expected_bin_counts_1,
hist.eval(feed_dict={values: values_1}))
def test_two_updates_on_scalar_input(self):
# Bins will be:
# (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
nbins = [5]
value_range = [0.0, 5.0]
new_values_1 = 1.5
new_values_2 = 2.5
values_1 = 1.5
values_2 = 2.5
expected_bin_counts_1 = [0, 1, 0, 0, 0]
expected_bin_counts_2 = [0, 1, 1, 0, 0]
with self.test_session() as sess:
hist = variables.Variable(array_ops.zeros(nbins, dtype=dtypes.int32))
new_values = array_ops.placeholder(dtypes.float32, shape=[])
hist_update = histogram_ops.histogram_fixed_width(hist, new_values,
value_range)
variables.initialize_all_variables().run()
expected_bin_counts_2 = [0, 0, 1, 0, 0]
with self.test_session():
values = tf.placeholder(tf.float32, shape=[])
hist = tf.histogram_fixed_width(values, value_range, nbins=5)
tf.initialize_all_variables().run()
# The new updated_hist_array is returned by the updating op.
# hist should contain the updated values.
updated_hist_array = sess.run(hist_update,
feed_dict={new_values: new_values_1})
self.assertAllClose(expected_bin_counts_1, updated_hist_array)
self.assertAllClose(expected_bin_counts_1, hist.eval())
# The values in hist should depend on the current feed and nothing else.
self.assertAllClose(expected_bin_counts_2,
hist.eval(feed_dict={values: values_2}))
self.assertAllClose(expected_bin_counts_1,
hist.eval(feed_dict={values: values_1}))
self.assertAllClose(expected_bin_counts_1,
hist.eval(feed_dict={values: values_1}))
self.assertAllClose(expected_bin_counts_2,
hist.eval(feed_dict={values: values_2}))
updated_hist_array = sess.run(hist_update,
feed_dict={new_values: new_values_2})
self.assertAllClose(expected_bin_counts_2, updated_hist_array)
self.assertAllClose(expected_bin_counts_2, hist.eval())
def test_multiple_random_3d_updates_results_in_right_dist(self):
# Update with uniform 3-D rvs. Resultant
def test_multiple_random_accumulating_updates_results_in_right_dist(self):
# Accumulate the updates in a new variable. Resultant
# histogram should be uniform. Use only 3 bins because with many bins it
# would be unlikely that all would be close to 1/n. If someone ever wants
# to test that, it would be better to check that the cdf was linear.
nbins = [3]
value_range = [1.0, 4.14159]
with self.test_session() as sess:
hist = variables.Variable(array_ops.zeros(nbins, dtype=dtypes.int32))
new_values = array_ops.placeholder(dtypes.float32, shape=[4, 4, 4])
hist_update = histogram_ops.histogram_fixed_width(hist, new_values,
value_range)
variables.initialize_all_variables().run()
values = tf.placeholder(tf.float32, shape=[4, 4, 4])
hist = tf.histogram_fixed_width(values,
value_range,
nbins=3,
dtype=tf.int64)
hist_accum = tf.Variable(tf.zeros_initializer([3], dtype=tf.int64))
hist_accum = hist_accum.assign_add(hist)
tf.initialize_all_variables().run()
for _ in range(100):
# Map the rv: U[0, 1] --> U[value_range[0], value_range[1]].
new_values_arr = (
values_arr = (
value_range[0] +
(value_range[1] - value_range[0]) * self.rng.rand(4, 4, 4))
# The new updated_hist_array is returned by the updating op.
# hist should contain the updated values.
updated_hist_array = sess.run(hist_update,
feed_dict={new_values: new_values_arr})
hist_accum_arr = sess.run(hist_accum, feed_dict={values: values_arr})
pmf = updated_hist_array / float(updated_hist_array.sum())
pmf = hist_accum_arr / float(hist_accum_arr.sum())
np.testing.assert_allclose(1 / 3, pmf, atol=0.02)
if __name__ == '__main__':
googletest.main()
tf.test.main()
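Because the returned histogram now starts fresh on every evaluation, accumulating counts across batches is done with a separate variable, roughly as the last test above does; the bin count, value range, and batches here are illustrative:

```python
import tensorflow as tf

values = tf.placeholder(tf.float32, shape=[None])
hist = tf.histogram_fixed_width(values, [0.0, 1.0], nbins=3, dtype=tf.int64)

# `hist` itself is reset per evaluation, so keep a running total separately.
hist_accum = tf.Variable(tf.zeros_initializer([3], dtype=tf.int64))
update_accum = hist_accum.assign_add(hist)

with tf.Session() as sess:
  sess.run(tf.initialize_all_variables())
  for batch in ([0.1, 0.5, 0.9], [0.2, 0.4]):
    sess.run(update_accum, feed_dict={values: batch})
  print(sess.run(hist_accum))  # counts accumulated over both batches
```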

View File

@ -92,6 +92,7 @@ The "producer" functions add a queue to the graph and a corresponding
@@match_filenames_once
@@limit_epochs
@@input_producer
@@range_input_producer
@@slice_input_producer
@@string_input_producer

View File

@ -556,15 +556,13 @@ class EmbeddingWrapper(RNNCell):
feed into your RNN.
"""
def __init__(self, cell, embedding_classes=0, embedding=None,
initializer=None):
def __init__(self, cell, embedding_classes, embedding_size, initializer=None):
"""Create a cell with an added input embedding.
Args:
cell: an RNNCell, an embedding will be put before its inputs.
embedding_classes: integer, how many symbols will be embedded.
embedding: Variable, the embedding to use; if None, a new embedding
will be created; if set, then embedding_classes is not required.
embedding_size: integer, the size of the vectors we embed into.
initializer: an initializer to use when creating the embedding;
if None, the initializer from variable scope or a default one is used.
@ -574,21 +572,12 @@ class EmbeddingWrapper(RNNCell):
"""
if not isinstance(cell, RNNCell):
raise TypeError("The parameter cell is not RNNCell.")
if embedding_classes < 1 and embedding is None:
raise ValueError("Pass embedding or embedding_classes must be > 0: %d."
% embedding_classes)
if embedding_classes > 0 and embedding is not None:
if embedding.size[0] != embedding_classes:
raise ValueError("You declared embedding_classes=%d but passed an "
"embedding for %d classes." % (embedding.size[0],
embedding_classes))
if embedding.size[1] != cell.input_size:
raise ValueError("You passed embedding with output size %d and a cell"
" that accepts size %d." % (embedding.size[1],
cell.input_size))
if embedding_classes <= 0 or embedding_size <= 0:
raise ValueError("Both embedding_classes and embedding_size must be > 0: "
"%d, %d." % (embedding_classes, embedding_size))
self._cell = cell
self._embedding_classes = embedding_classes
self._embedding = embedding
self._embedding_size = embedding_size
self._initializer = initializer
@property
@ -607,20 +596,17 @@ class EmbeddingWrapper(RNNCell):
"""Run the cell on embedded inputs."""
with vs.variable_scope(scope or type(self).__name__): # "EmbeddingWrapper"
with ops.device("/cpu:0"):
if self._embedding:
embedding = self._embedding
if self._initializer:
initializer = self._initializer
elif vs.get_variable_scope().initializer:
initializer = vs.get_variable_scope().initializer
else:
if self._initializer:
initializer = self._initializer
elif vs.get_variable_scope().initializer:
initializer = vs.get_variable_scope().initializer
else:
# Default initializer for embeddings should have variance=1.
sqrt3 = math.sqrt(3) # Uniform(-sqrt(3), sqrt(3)) has variance=1.
initializer = init_ops.random_uniform_initializer(-sqrt3, sqrt3)
embedding = vs.get_variable("embedding", [self._embedding_classes,
self._cell.input_size],
initializer=initializer)
# Default initializer for embeddings should have variance=1.
sqrt3 = math.sqrt(3) # Uniform(-sqrt(3), sqrt(3)) has variance=1.
initializer = init_ops.random_uniform_initializer(-sqrt3, sqrt3)
embedding = vs.get_variable("embedding", [self._embedding_classes,
self._embedding_size],
initializer=initializer)
embedded = embedding_ops.embedding_lookup(
embedding, array_ops.reshape(inputs, [-1]))
return self._cell(embedded, state)
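With the constructor change above, callers pass both the vocabulary size and the embedding width explicitly; a minimal construction sketch (the cell type, sizes, and import path are assumptions, mirroring how the updated seq2seq call sites below use it):

```python
from tensorflow.python.ops import rnn_cell

cell = rnn_cell.GRUCell(num_units=128)
# Embed a 10,000-symbol vocabulary into vectors of the cell's input size.
encoder_cell = rnn_cell.EmbeddingWrapper(
    cell, embedding_classes=10000, embedding_size=cell.input_size)
```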

View File

@ -311,7 +311,9 @@ def embedding_rnn_seq2seq(encoder_inputs, decoder_inputs, cell,
"""
with variable_scope.variable_scope(scope or "embedding_rnn_seq2seq"):
# Encoder.
encoder_cell = rnn_cell.EmbeddingWrapper(cell, num_encoder_symbols)
encoder_cell = rnn_cell.EmbeddingWrapper(
cell, embedding_classes=num_encoder_symbols,
embedding_size=cell.input_size)
_, encoder_state = rnn.rnn(encoder_cell, encoder_inputs, dtype=dtype)
# Decoder.
@ -686,7 +688,9 @@ def embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell,
"""
with variable_scope.variable_scope(scope or "embedding_attention_seq2seq"):
# Encoder.
encoder_cell = rnn_cell.EmbeddingWrapper(cell, num_encoder_symbols)
encoder_cell = rnn_cell.EmbeddingWrapper(
cell, embedding_classes=num_encoder_symbols,
embedding_size=cell.input_size)
encoder_outputs, encoder_state = rnn.rnn(
encoder_cell, encoder_inputs, dtype=dtype)
@ -772,7 +776,9 @@ def one2many_rnn_seq2seq(encoder_inputs, decoder_inputs_dict, cell,
with variable_scope.variable_scope(scope or "one2many_rnn_seq2seq"):
# Encoder.
encoder_cell = rnn_cell.EmbeddingWrapper(cell, num_encoder_symbols)
encoder_cell = rnn_cell.EmbeddingWrapper(
cell, embedding_classes=num_encoder_symbols,
embedding_size=cell.input_size)
_, encoder_state = rnn.rnn(encoder_cell, encoder_inputs, dtype=dtype)
# Decoder.

View File

@ -774,7 +774,7 @@ def _SerializeManySparseShape(op): # pylint: disable=invalid-name
return [tensor_shape.matrix(None, 3)]
def deserialize_many_sparse(serialized_sparse, dtype, name=None):
def deserialize_many_sparse(serialized_sparse, dtype, rank=None, name=None):
"""Deserialize and concatenate `SparseTensors` from a serialized minibatch.
The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
@ -823,6 +823,7 @@ def deserialize_many_sparse(serialized_sparse, dtype, name=None):
serialized_sparse: 2-D `Tensor` of type `string` of shape `[N, 3]`.
The serialized and packed `SparseTensor` objects.
dtype: The `dtype` of the serialized `SparseTensor` objects.
rank: (optional) Python int, the rank of the `SparseTensor` objects.
name: A name prefix for the returned tensors (optional)
Returns:
@ -835,6 +836,10 @@ def deserialize_many_sparse(serialized_sparse, dtype, name=None):
gen_sparse_ops._deserialize_many_sparse(
serialized_sparse, dtype, name=name))
# Feed rank data back in, if available
output_indices.set_shape([None, rank])
output_shape.set_shape([rank])
return ops.SparseTensor(output_indices, output_values, output_shape)
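A sketch of what the optional `rank` argument buys: the static shapes of the deserialized indices and dense shape get pinned down, exactly as the two `set_shape` calls above do (the placeholder standing in for the serialized minibatch is an assumption):

```python
import tensorflow as tf
from tensorflow.python.ops import sparse_ops

# Stand-in for an [N, 3] string matrix of packed SparseTensors, e.g. as
# produced upstream by serialize_sparse/serialize_many_sparse.
serialized = tf.placeholder(tf.string, shape=[None, 3])

# Without `rank`, the column dimension of the indices stays unknown;
# passing rank pins the static shapes via set_shape, as above.
sp = sparse_ops.deserialize_many_sparse(serialized, tf.float32, rank=3)
print(sp.indices.get_shape())  # (?, 3)
print(sp.shape.get_shape())    # (3,)
```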

View File

@ -42,6 +42,7 @@ from tensorflow.python.ops.control_flow_ops import foldr
from tensorflow.python.ops.control_flow_ops import map_fn
from tensorflow.python.ops.data_flow_ops import *
from tensorflow.python.ops.gradients import *
from tensorflow.python.ops.histogram_ops import *
from tensorflow.python.ops.init_ops import *
from tensorflow.python.ops.io_ops import *
from tensorflow.python.ops.linalg_ops import *

View File

@ -0,0 +1,213 @@
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities to run benchmarks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import inspect
import numbers
import os
import re
import sys
import six # pylint: disable=unused-import
from google.protobuf import text_format
from tensorflow.core.util import test_log_pb2
from tensorflow.python.platform import app
from tensorflow.python.platform import gfile
# When a subclass of the Benchmark class is created, it is added to
# the registry automatically
GLOBAL_BENCHMARK_REGISTRY = set()
# Environment variable that determines whether benchmarks are written.
# See also tensorflow/core/util/reporter.h TestReporter::kTestReporterEnv.
TEST_REPORTER_TEST_ENV = "TEST_REPORT_FILE_PREFIX"
def _global_report_benchmark(
name, iters=None, cpu_time=None, wall_time=None,
throughput=None, extras=None):
"""Method for recording a benchmark directly.
Args:
name: The BenchmarkEntry name.
iters: (optional) How many iterations were run
cpu_time: (optional) Total cpu time in seconds
wall_time: (optional) Total wall time in seconds
throughput: (optional) Throughput (in MB/s)
extras: (optional) Dict mapping string keys to additional benchmark info.
Raises:
TypeError: if extras is not a dict.
IOError: if the benchmark output file already exists.
"""
if extras is not None:
if not isinstance(extras, dict):
raise TypeError("extras must be a dict")
test_env = os.environ.get(TEST_REPORTER_TEST_ENV, None)
if test_env is None:
# Reporting was not requested
return
entry = test_log_pb2.BenchmarkEntry()
entry.name = name
if iters is not None:
entry.iters = iters
if cpu_time is not None:
entry.cpu_time = cpu_time
if wall_time is not None:
entry.wall_time = wall_time
if throughput is not None:
entry.throughput = throughput
if extras is not None:
for (k, v) in extras.items():
if isinstance(v, numbers.Number):
entry.extras[k].double_value = v
else:
entry.extras[k].string_value = str(v)
serialized_entry = text_format.MessageToString(entry)
mangled_name = name.replace("/", "__")
output_path = "%s%s" % (test_env, mangled_name)
if gfile.Exists(output_path):
raise IOError("File already exists: %s" % output_path)
with gfile.GFile(output_path, "w") as out:
out.write(serialized_entry)
class _BenchmarkRegistrar(type):
"""The Benchmark class registrar. Used by abstract Benchmark class."""
def __new__(mcs, clsname, base, attrs):
newclass = super(mcs, _BenchmarkRegistrar).__new__(
mcs, clsname, base, attrs)
if len(newclass.mro()) > 2:
# Only the base Benchmark abstract class has mro length 2.
# The rest subclass from it and are therefore registered.
GLOBAL_BENCHMARK_REGISTRY.add(newclass)
return newclass
class Benchmark(object):
"""Abstract class that provides helper functions for running benchmarks.
Any class subclassing this one is immediately registered in the global
benchmark registry.
Only methods whose names start with the word "benchmark" will be run during
benchmarking.
"""
__metaclass__ = _BenchmarkRegistrar
def _get_name(self, overwrite_name):
"""Returns full name of class and method calling report_benchmark."""
# Expect that the caller called report_benchmark, which called _get_name.
caller = inspect.stack()[2]
calling_class = caller[0].f_locals.get("self", None)
# Use the method name, or overwrite_name if provided.

name = overwrite_name if overwrite_name is not None else caller[3]
if calling_class is not None:
# Prefix the name with the class name.
class_name = type(calling_class).__name__
name = "%s.%s" % (class_name, name)
return name
def report_benchmark(
self,
iters=None,
cpu_time=None,
wall_time=None,
throughput=None,
extras=None,
name=None):
"""Report a benchmark.
Args:
iters: (optional) How many iterations were run
cpu_time: (optional) Total cpu time in seconds
wall_time: (optional) Total wall time in seconds
throughput: (optional) Throughput (in MB/s)
extras: (optional) Dict mapping string keys to additional benchmark info.
name: (optional) Override the BenchmarkEntry name with `name`.
Otherwise it is inferred from the calling class and top-level
method name.
"""
name = self._get_name(overwrite_name=name)
_global_report_benchmark(
name=name, iters=iters, cpu_time=cpu_time, wall_time=wall_time,
throughput=throughput, extras=extras)
def _run_specific_benchmark(benchmark_class):
benchmark = benchmark_class()
attrs = dir(benchmark)
# Only run methods of this class whose names start with "benchmark"
for attr in attrs:
if not attr.startswith("benchmark"):
continue
benchmark_fn = getattr(benchmark, attr)
if not callable(benchmark_fn):
continue
# Call this benchmark method
benchmark_fn()
def _run_benchmarks(regex):
"""Run benchmarks that match regex `regex`.
This function goes through the global benchmark registry, and matches
benchmark **class names** of the form "module.name.BenchmarkClass" to
the given regex. If a class matches, all of its benchmark methods
are run.
Args:
regex: The string regular expression to match Benchmark classes against.
"""
registry = list(GLOBAL_BENCHMARK_REGISTRY)
# Match benchmarks in registry against regex
for benchmark in registry:
benchmark_name = "%s.%s" % (benchmark.__module__, benchmark.__name__)
if re.search(regex, benchmark_name):
# Found a match
_run_specific_benchmark(benchmark)
def benchmarks_main(true_main=None):
"""Run benchmarks as declared in args.
Args:
true_main: True main function to run if benchmarks are not requested.
"""
argv = sys.argv
found_arg = [arg for arg in argv
if arg.startswith("--benchmarks=")
or arg.startswith("-benchmarks=")]
if found_arg:
# Remove --benchmarks arg from sys.argv
argv.remove(found_arg[0])
regex = found_arg[0].split("=")[1]
app.run(lambda _: _run_benchmarks(regex))
else:
true_main()
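A minimal sketch of how a benchmark might be declared with these utilities; the class, op, and sizes are illustrative, and `tf.test.Benchmark`/`tf.test.main` refer to the exports added elsewhere in this change:

```python
import time

import tensorflow as tf


class MatmulBenchmark(tf.test.Benchmark):
  """Subclassing Benchmark registers this class automatically."""

  def benchmarkMatmul256(self):
    # Only methods whose names start with "benchmark" are picked up.
    with tf.Session() as sess:
      x = tf.random_normal([256, 256])
      product = tf.matmul(x, x)
      start = time.time()
      for _ in range(10):
        sess.run(product)
      self.report_benchmark(iters=10,
                            wall_time=(time.time() - start) / 10,
                            name="matmul_256")


if __name__ == "__main__":
  # Runs the unit tests, or the matching benchmarks when
  # --benchmarks=<regex> is passed on the command line.
  tf.test.main()
```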

View File

@ -23,8 +23,8 @@ import sys
from tensorflow.python.platform import flags
def run():
def run(main=None):
f = flags.FLAGS
f._parse_flags()
main = sys.modules['__main__'].main
main = main or sys.modules['__main__'].main
sys.exit(main(sys.argv))
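With this change `run()` accepts the main function explicitly rather than requiring a module-level function named `main`; a small sketch (the flag and function names are made up):

```python
import tensorflow as tf

flags = tf.app.flags
flags.DEFINE_string("data_dir", "/tmp/data", "Where to read input from.")
FLAGS = flags.FLAGS


def train(unused_argv):
  print("Reading from", FLAGS.data_dir)
  return 0


if __name__ == "__main__":
  # Previously this had to be spelled `def main(...)` in __main__.
  tf.app.run(main=train)
```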

View File

@ -21,7 +21,20 @@ from __future__ import print_function
# pylint: disable=g-import-not-at-top
# pylint: disable=wildcard-import
from . import control_imports
from tensorflow.python.platform import benchmark
# Import the Benchmark class
Benchmark = benchmark.Benchmark # pylint: disable=invalid-name
if control_imports.USE_OSS and control_imports.OSS_GOOGLETEST:
from tensorflow.python.platform.default._googletest import *
from tensorflow.python.platform.default._googletest import main as g_main
else:
from tensorflow.python.platform.google._googletest import *
from tensorflow.python.platform.google._googletest import main as g_main
# Redefine main to allow running benchmarks
def main():
# Benchmarks determine whether to run tests or not, by calling g_main
benchmark.benchmarks_main(true_main=g_main)

View File

@ -72,6 +72,10 @@ from tensorflow.python.kernel_tests.gradient_checker import compute_gradient
# pylint: enable=unused-import
# Import Benchmark class
Benchmark = googletest.Benchmark # pylint: disable=invalid-name
def main():
"""Runs all unit tests."""
return googletest.main()

View File

@ -131,6 +131,8 @@ class Coordinator(object):
# Event set when threads must stop.
self._stop_event = threading.Event()
# Python exc_info to report.
# If not None, it should hold the returned value of sys.exc_info(), which is
# a tuple containing exception (type, value, traceback).
self._exc_info_to_raise = None
def request_stop(self, ex=None):
@ -138,6 +140,10 @@ class Coordinator(object):
After this is called, calls to `should_stop()` will return `True`.
Note: If an exception is being passed in, it must be in the context of
handling the exception (i.e. `try: ... except Exception as ex: ...`) and not
a newly created one.
Args:
ex: Optional `Exception`, or Python `exc_info` tuple as returned by
`sys.exc_info()`. If this is the first call to `request_stop()` the
@ -154,6 +160,22 @@ class Coordinator(object):
logging.info("Error reported to Coordinator: %s",
compat.as_str_any(ex))
self._exc_info_to_raise = sys.exc_info()
# self._exc_info_to_raise should contain a tuple containing exception
# (type, value, traceback)
if (len(self._exc_info_to_raise) != 3 or
not self._exc_info_to_raise[0] or
not self._exc_info_to_raise[1]):
# Raise, catch and record the exception here so that error happens
# where expected.
try:
raise ValueError(
"ex must be a tuple or sys.exc_info must return the current "
"exception: %s"
% self._exc_info_to_raise)
except ValueError:
# Record this error so it kills the coordinator properly.
self._exc_info_to_raise = sys.exc_info()
self._stop_event.set()
def clear_stop(self):
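The added note means `request_stop(ex)` should be called from inside the `except` block (or be handed a `sys.exc_info()` tuple) so that the traceback is still live; a small illustrative sketch, with the worker body made up:

```python
import threading
import tensorflow as tf


def worker(coord, n):
  try:
    for i in range(n):
      if coord.should_stop():
        return
      if i == 3:
        raise RuntimeError("simulated failure in worker")
  except Exception as ex:  # pylint: disable=broad-except
    # Called while the exception is being handled, so sys.exc_info()
    # still refers to it; coord.join() below will re-raise it.
    coord.request_stop(ex)

coord = tf.train.Coordinator()
threads = [threading.Thread(target=worker, args=(coord, 10)) for _ in range(2)]
for t in threads:
  t.start()
try:
  coord.join(threads)
except RuntimeError as err:
  print("Coordinator re-raised:", err)
```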

View File

@ -84,20 +84,63 @@ def limit_epochs(tensor, num_epochs=None, name=None):
return array_ops.identity(tensor, name=name)
def _input_producer(input_tensor, dtype, num_epochs, shuffle, seed, capacity,
shared_name, name, summary_name):
if shuffle:
input_tensor = random_ops.random_shuffle(input_tensor, seed=seed)
input_tensor = limit_epochs(input_tensor, num_epochs)
def input_producer(input_tensor, element_shape=None, num_epochs=None,
shuffle=True, seed=None, capacity=32, shared_name=None,
summary_name=None, name=None):
"""Output the rows of `input_tensor` to a queue for an input pipeline.
q = data_flow_ops.FIFOQueue(capacity=capacity, dtypes=[dtype], shapes=[[]],
shared_name=shared_name, name=name)
enq = q.enqueue_many([input_tensor])
queue_runner.add_queue_runner(queue_runner.QueueRunner(q, [enq]))
logging_ops.scalar_summary("queue/%s/%s" % (q.name, summary_name),
math_ops.cast(q.size(), dtypes.float32) *
(1. / capacity))
return q
Args:
input_tensor: A tensor with the rows to produce. Must be at least
one-dimensional. Must either have a fully-defined shape, or
`element_shape` must be defined.
element_shape: (Optional.) A `TensorShape` representing the shape of a
row of `input_tensor`, if it cannot be inferred.
num_epochs: (Optional.) An integer. If specified, `input_producer` produces
each row of `input_tensor` `num_epochs` times before generating an
`OutOfRange` error. If not specified, `input_producer` can cycle through
the rows of `input_tensor` an unlimited number of times.
shuffle: (Optional.) A boolean. If true, the rows are randomly shuffled
within each epoch.
seed: (Optional.) An integer. The seed to use if `shuffle` is true.
capacity: (Optional.) The capacity of the queue to be used for buffering
the input.
shared_name: (Optional.) If set, this queue will be shared under the given
name across multiple sessions.
summary_name: (Optional.) If set, a scalar summary for the current queue
size will be generated, using this name as part of the tag.
name: (Optional.) A name for the queue.
Returns:
A queue with the output rows. A `QueueRunner` for the queue is
added to the `QUEUE_RUNNER` collection of the current graph.
Raises:
ValueError: If the shape of the input cannot be inferred from the arguments.
"""
with ops.op_scope([input_tensor], name, "input_producer"):
input_tensor = ops.convert_to_tensor(input_tensor, name="input_tensor")
element_shape = input_tensor.get_shape()[1:].merge_with(element_shape)
if not element_shape.is_fully_defined():
raise ValueError("Either `input_tensor` must have a fully defined shape "
"or `element_shape` must be specified")
if shuffle:
input_tensor = random_ops.random_shuffle(input_tensor, seed=seed)
input_tensor = limit_epochs(input_tensor, num_epochs)
q = data_flow_ops.FIFOQueue(capacity=capacity,
dtypes=[input_tensor.dtype.base_dtype],
shapes=[element_shape],
shared_name=shared_name, name=name)
enq = q.enqueue_many([input_tensor])
queue_runner.add_queue_runner(queue_runner.QueueRunner(q, [enq]))
if summary_name is not None:
logging_ops.scalar_summary("queue/%s/%s" % (q.name, summary_name),
math_ops.cast(q.size(), dtypes.float32) *
(1. / capacity))
return q
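A short usage sketch of the newly public `input_producer`, based on the docstring above and mirroring the tests added further below; the row values are illustrative, and `element_shape` is only needed when the row shape cannot be inferred:

```python
import tensorflow as tf

rows = [[1, 2, 3, 4],
        [5, 6, 7, 8]]

queue = tf.train.input_producer(rows, num_epochs=2, shuffle=False)
dequeued = queue.dequeue_many(4)  # 2 rows x 2 epochs

with tf.Session() as sess:
  sess.run(tf.initialize_all_variables())
  threads = tf.train.start_queue_runners(sess=sess)
  print(sess.run(dequeued))  # the two rows, repeated in order for each epoch
  for t in threads:
    t.join()
```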
def string_input_producer(string_tensor, num_epochs=None, shuffle=True,
@ -108,9 +151,9 @@ def string_input_producer(string_tensor, num_epochs=None, shuffle=True,
string_tensor: A 1-D string tensor with the strings to produce.
num_epochs: An integer (optional). If specified, `string_input_producer`
produces each string from `string_tensor` `num_epochs` times before
generating an OutOfRange error. If not specified, `string_input_producer`
can cycle through the strings in `string_tensor` an unlimited number of
times.
generating an `OutOfRange` error. If not specified,
`string_input_producer` can cycle through the strings in `string_tensor`
an unlimited number of times.
shuffle: Boolean. If true, the strings are randomly shuffled within each
epoch.
seed: An integer (optional). Seed used if shuffle == True.
@ -137,9 +180,9 @@ def string_input_producer(string_tensor, num_epochs=None, shuffle=True,
logging_ops.Assert(math_ops.greater(array_ops.size(string_tensor), 0),
[not_null_err])]):
string_tensor = array_ops.identity(string_tensor)
return _input_producer(
return input_producer(
input_tensor=string_tensor,
dtype=dtypes.string,
element_shape=[],
num_epochs=num_epochs,
shuffle=shuffle,
seed=seed,
@ -173,8 +216,8 @@ def range_input_producer(limit, num_epochs=None, shuffle=True, seed=None,
"""
with ops.op_scope([limit], name, "input_producer") as name:
range_tensor = math_ops.range(limit)
return _input_producer(
range_tensor, dtypes.int32, num_epochs, shuffle, seed, capacity,
return input_producer(
range_tensor, [], num_epochs, shuffle, seed, capacity,
shared_name, name, "fraction_of_%d_full" % capacity)
@ -231,51 +274,104 @@ def _flatten(tensor_list_list):
return [tensor for tensor_list in tensor_list_list for tensor in tensor_list]
class _SparseMetaData(object):
"""Store information about the Tensor: Is it sparse?, dtype, and rank."""
def __init__(self, sparse, dtype, rank):
self._sparse = sparse
self._dtype = dtype
self._rank = rank
def __eq__(self, other):
if self.sparse != other.sparse:
return False
if not self.sparse:
return True
if self.dtype != other.dtype:
return False
if not self.rank.is_compatible_with(other.rank):
return False
return True
def __ne__(self, other):
return not self.__eq__(other)
def __str__(self):
return "[SparseMetaData(%s, %s, %s)]" % (self.sparse, self.dtype, self.rank)
def merge_with(self, other):
if self != other:
raise ValueError("SparseMetaData objects are incompatible: %s vs. %s"
% (self, other))
if self.sparse:
self.rank.merge_with(other.rank)
return self
@property
def dtype(self):
return self._dtype
@property
def sparse(self):
return self._sparse
@property
def rank(self):
return self._rank
def _serialize_sparse_tensors(tensor_list, enqueue_many):
"""Serialize SparseTensors for feeding into batch, etc."""
is_sparse_list = [isinstance(t, ops.SparseTensor) for t in tensor_list]
sparse_dtypes_list = [
t.dtype if isinstance(t, ops.SparseTensor) else None
sparse_info_list = [
_SparseMetaData(sparse=True,
dtype=t.dtype,
rank=t.shape.get_shape().with_rank(1)[0])
if isinstance(t, ops.SparseTensor)
else _SparseMetaData(False, None, None)
for t in tensor_list]
def _maybe_serialize(t, is_sparse):
if not is_sparse:
def _maybe_serialize(t, sparse):
if not sparse:
return t
return (sparse_ops.serialize_many_sparse(t) if enqueue_many
else sparse_ops.serialize_sparse(t))
serialized_list = [
_maybe_serialize(t, is_sparse)
for (t, is_sparse) in zip(tensor_list, is_sparse_list)]
return serialized_list, is_sparse_list, sparse_dtypes_list
_maybe_serialize(t, info.sparse) for (t, info)
in zip(tensor_list, sparse_info_list)]
return serialized_list, sparse_info_list
def _serialize_sparse_tensors_join(tensor_list_list, enqueue_many):
"""Serialize SparseTensors for feeding into batch_join, etc."""
(s0, is_sparse_list, sparse_dtypes_list) = _serialize_sparse_tensors(
(s0, sparse_info_list) = _serialize_sparse_tensors(
tensor_list_list[0], enqueue_many)
serialized_list_list = [s0]
for tensor_list in tensor_list_list[1:]:
(s, is_sparse_candidate, sparse_dtypes_candidate) = (
_serialize_sparse_tensors(tensor_list, enqueue_many))
if is_sparse_candidate != is_sparse_list:
s, sparse_info_candidate = _serialize_sparse_tensors(
tensor_list, enqueue_many)
if sparse_info_list != sparse_info_candidate:
raise ValueError("Inconsistent SparseTensors list: %s vs. %s"
% (tensor_list_list[0], tensor_list))
if sparse_dtypes_candidate != sparse_dtypes_list:
raise ValueError("Inconsistent SparseTensor dtypes in list: %s vs. %s"
% (tensor_list_list[0], tensor_list))
sparse_info_list = [
info.merge_with(candidate)
for (info, candidate) in zip(sparse_info_list, sparse_info_candidate)]
serialized_list_list.append(s)
return (serialized_list_list, is_sparse_list, sparse_dtypes_list)
return (serialized_list_list, sparse_info_list)
def _deserialize_sparse_tensors(serialized_list, is_sparse_list, sparse_dtypes):
def _deserialize_sparse_tensors(serialized_list, sparse_info_list):
"""Deserialize SparseTensors after dequeue in batch, batch_join, etc."""
received_sequence = isinstance(serialized_list, collections.Sequence)
if not received_sequence:
serialized_list = (serialized_list,)
tensors = [sparse_ops.deserialize_many_sparse(s, sparse_dtype) if is_sparse
else s
for (s, is_sparse, sparse_dtype)
in zip(serialized_list, is_sparse_list, sparse_dtypes)]
tensors = [
sparse_ops.deserialize_many_sparse(s, info.dtype, info.rank.value)
if info.sparse else s
for (s, info)
in zip(serialized_list, sparse_info_list)]
return tensors if received_sequence else tensors[0]
@ -345,7 +441,8 @@ def _enqueue(queue, tensor_list, threads, enqueue_many):
def batch(tensor_list, batch_size, num_threads=1, capacity=32,
enqueue_many=False, shapes=None, shared_name=None, name=None):
enqueue_many=False, shapes=None,
shared_name=None, name=None):
"""Creates batches of tensors in `tensor_list`.
This function is implemented using a queue. A `QueueRunner` for the
@ -394,7 +491,7 @@ def batch(tensor_list, batch_size, num_threads=1, capacity=32,
"""
with ops.op_scope(tensor_list, name, "batch") as name:
tensor_list = _validate(tensor_list)
tensor_list, is_sparse, sparse_dtypes = _serialize_sparse_tensors(
(tensor_list, sparse_info) = _serialize_sparse_tensors(
tensor_list, enqueue_many)
types = _dtypes([tensor_list])
shapes = _shapes([tensor_list], shapes, enqueue_many)
@ -407,7 +504,7 @@ def batch(tensor_list, batch_size, num_threads=1, capacity=32,
math_ops.cast(queue.size(), dtypes.float32) * (1. / capacity))
dequeued = queue.dequeue_many(batch_size, name=name)
dequeued = _deserialize_sparse_tensors(dequeued, is_sparse, sparse_dtypes)
dequeued = _deserialize_sparse_tensors(dequeued, sparse_info)
return dequeued
@ -478,8 +575,8 @@ def batch_join(tensor_list_list, batch_size, capacity=32, enqueue_many=False,
"""
with ops.op_scope(_flatten(tensor_list_list), name, "batch_join") as name:
tensor_list_list = _validate_join(tensor_list_list)
tensor_list_list, is_sparse, sparse_dtypes = (
_serialize_sparse_tensors_join(tensor_list_list, enqueue_many))
tensor_list_list, sparse_info = _serialize_sparse_tensors_join(
tensor_list_list, enqueue_many)
types = _dtypes(tensor_list_list)
shapes = _shapes(tensor_list_list, shapes, enqueue_many)
# TODO(josh11b,mrry): Switch to BatchQueue once it is written.
@ -491,7 +588,7 @@ def batch_join(tensor_list_list, batch_size, capacity=32, enqueue_many=False,
math_ops.cast(queue.size(), dtypes.float32) * (1. / capacity))
dequeued = queue.dequeue_many(batch_size, name=name)
dequeued = _deserialize_sparse_tensors(dequeued, is_sparse, sparse_dtypes)
dequeued = _deserialize_sparse_tensors(dequeued, sparse_info)
return dequeued
@ -567,7 +664,7 @@ def shuffle_batch(tensor_list, batch_size, capacity, min_after_dequeue,
"""
with ops.op_scope(tensor_list, name, "shuffle_batch") as name:
tensor_list = _validate(tensor_list)
tensor_list, is_sparse, sparse_dtypes = _serialize_sparse_tensors(
tensor_list, sparse_info = _serialize_sparse_tensors(
tensor_list, enqueue_many)
types = _dtypes([tensor_list])
shapes = _shapes([tensor_list], shapes, enqueue_many)
@ -586,7 +683,7 @@ def shuffle_batch(tensor_list, batch_size, capacity, min_after_dequeue,
logging_ops.scalar_summary(summary_name, full)
dequeued = queue.dequeue_many(batch_size, name=name)
dequeued = _deserialize_sparse_tensors(dequeued, is_sparse, sparse_dtypes)
dequeued = _deserialize_sparse_tensors(dequeued, sparse_info)
return dequeued
@ -652,8 +749,8 @@ def shuffle_batch_join(tensor_list_list, batch_size, capacity,
with ops.op_scope(
_flatten(tensor_list_list), name, "shuffle_batch_join") as name:
tensor_list_list = _validate_join(tensor_list_list)
tensor_list_list, is_sparse, sparse_dtypes = (
_serialize_sparse_tensors_join(tensor_list_list, enqueue_many))
tensor_list_list, sparse_info = _serialize_sparse_tensors_join(
tensor_list_list, enqueue_many)
types = _dtypes(tensor_list_list)
shapes = _shapes(tensor_list_list, shapes, enqueue_many)
queue = data_flow_ops.RandomShuffleQueue(
@ -671,5 +768,5 @@ def shuffle_batch_join(tensor_list_list, batch_size, capacity,
logging_ops.scalar_summary(summary_name, full)
dequeued = queue.dequeue_many(batch_size, name=name)
dequeued = _deserialize_sparse_tensors(dequeued, is_sparse, sparse_dtypes)
dequeued = _deserialize_sparse_tensors(dequeued, sparse_info)
return dequeued

View File

@ -69,6 +69,60 @@ class LimitEpochsTest(tf.test.TestCase):
love_me_two_times.eval()
class InputProducerTest(tf.test.TestCase):
def testNoShuffle(self):
with self.test_session():
input_tensor = [[1, 2, 3, 4],
[5, 6, 7, 8],
[9, 10, 11, 12]]
num_epochs = 2
queue = tf.train.input_producer(
input_tensor, num_epochs=num_epochs, shuffle=False)
dequeue_many = queue.dequeue_many(len(input_tensor) * num_epochs)
dequeue = queue.dequeue()
tf.initialize_all_variables().run()
threads = tf.train.start_queue_runners()
# No randomness, so just see repeated copies of the input.
self.assertAllEqual(input_tensor * num_epochs, dequeue_many.eval())
# Reached the limit.
with self.assertRaises(tf.errors.OutOfRangeError):
dequeue.eval()
for thread in threads:
thread.join()
def testNoShapeInference(self):
with self.test_session():
# Disable shape inference for the input.
input_value = [[1, 2, 3, 4],
[5, 6, 7, 8],
[9, 10, 11, 12]]
input_tensor = tf.placeholder_with_default(input_value, shape=None)
num_epochs = 2
queue = tf.train.input_producer(
input_tensor, element_shape=[4], num_epochs=num_epochs, shuffle=False)
dequeue_many = queue.dequeue_many(len(input_value) * num_epochs)
dequeue = queue.dequeue()
tf.initialize_all_variables().run()
threads = tf.train.start_queue_runners()
# No randomness, so just see repeated copies of the input.
self.assertAllEqual(input_value * num_epochs, dequeue_many.eval())
# Reached the limit.
with self.assertRaises(tf.errors.OutOfRangeError):
dequeue.eval()
for thread in threads:
thread.join()
def testShapeError(self):
input_tensor = tf.placeholder(tf.float32, None)
with self.assertRaisesRegexp(ValueError, "fully defined shape"):
_ = tf.train.input_producer(input_tensor)
class StringInputProducerTest(tf.test.TestCase):
def testNoShuffle(self):

View File

@ -25,11 +25,14 @@ import time
import six
from tensorflow.core.framework import graph_pb2
from tensorflow.core.framework import summary_pb2
from tensorflow.core.util import event_pb2
from tensorflow.python import pywrap_tensorflow
from tensorflow.python.framework import ops
from tensorflow.python.lib.io import tf_record
from tensorflow.python.platform import gfile
from tensorflow.python.platform import logging
from tensorflow.python.util import compat
@ -53,7 +56,8 @@ class SummaryWriter(object):
@@close
"""
def __init__(self, logdir, graph_def=None, max_queue=10, flush_secs=120):
def __init__(self, logdir, graph=None, max_queue=10, flush_secs=120,
graph_def=None):
"""Creates a `SummaryWriter` and an event file.
On construction the summary writer creates a new event file in `logdir`.
@ -61,7 +65,7 @@ class SummaryWriter(object):
call one of the following functions: `add_summary()`, `add_session_log()`,
`add_event()`, or `add_graph()`.
If you pass a `graph_def` protocol buffer to the constructor it is added to
If you pass a `Graph` to the constructor it is added to
the event file. (This is equivalent to calling `add_graph()` later).
TensorBoard will pick the graph from the file and display it graphically so
@ -72,8 +76,8 @@ class SummaryWriter(object):
...create a graph...
# Launch the graph in a session.
sess = tf.Session()
# Create a summary writer, add the 'graph_def' to the event file.
writer = tf.train.SummaryWriter(<some-directory>, sess.graph_def)
# Create a summary writer, add the 'graph' to the event file.
writer = tf.train.SummaryWriter(<some-directory>, sess.graph)
```
The other arguments to the constructor control the asynchronous writes to
@ -86,10 +90,11 @@ class SummaryWriter(object):
Args:
logdir: A string. Directory where event file will be written.
graph_def: A `GraphDef` protocol buffer.
graph: A `Graph` object, such as `sess.graph`.
max_queue: Integer. Size of the queue for pending events and summaries.
flush_secs: Number. How often, in seconds, to flush the
pending events and summaries to disk.
graph_def: DEPRECATED: Use the `graph` argument instead.
"""
self._logdir = logdir
if not gfile.IsDirectory(self._logdir):
@ -100,8 +105,9 @@ class SummaryWriter(object):
self._worker = _EventLoggerThread(self._event_queue, self._ev_writer,
flush_secs)
self._worker.start()
if graph_def is not None:
self.add_graph(graph_def)
if graph is not None or graph_def is not None:
# Calling it with both graph and graph_def for backward compatibility.
self.add_graph(graph=graph, graph_def=graph_def)
def add_summary(self, summary, global_step=None):
"""Adds a `Summary` protocol buffer to the event file.
@ -154,23 +160,65 @@ class SummaryWriter(object):
"""
self._event_queue.put(event)
def add_graph(self, graph_def, global_step=None):
"""Adds a `GraphDef` protocol buffer to the event file.
The graph described by the protocol buffer will be displayed by
TensorBoard. Most users pass a graph in the constructor instead.
Args:
graph_def: A `GraphDef` protocol buffer.
global_step: Number. Optional global step counter to record with the
graph.
"""
def _add_graph_def(self, graph_def, global_step=None):
graph_bytes = graph_def.SerializeToString()
event = event_pb2.Event(wall_time=time.time(), graph_def=graph_bytes)
if global_step is not None:
event.step = int(global_step)
self._event_queue.put(event)
def add_graph(self, graph, global_step=None, graph_def=None):
"""Adds a `Graph` to the event file.
The graph described by the protocol buffer will be displayed by
TensorBoard. Most users pass a graph in the constructor instead.
Args:
graph: A `Graph` object, such as `sess.graph`.
global_step: Number. Optional global step counter to record with the
graph.
graph_def: DEPRECATED. Use the `graph` parameter instead.
Raises:
ValueError: If both graph and graph_def are passed to the method.
"""
if graph is not None and graph_def is not None:
raise ValueError("Please pass only graph, or graph_def (deprecated), "
"but not both.")
if isinstance(graph, ops.Graph) or isinstance(graph_def, ops.Graph):
# The user passed a `Graph`.
# Check if the user passed it via the graph or the graph_def argument and
# correct for that.
if not isinstance(graph, ops.Graph):
logging.warning("When passing a `Graph` object, please use the `graph`"
" named argument instead of `graph_def`.")
graph = graph_def
# Serialize the graph with additional info.
true_graph_def = graph.as_graph_def(add_shapes=True)
elif (isinstance(graph, graph_pb2.GraphDef)
or isinstance(graph_def, graph_pb2.GraphDef)):
# The user passed a `GraphDef`.
logging.warning("Passing a `GraphDef` to the SummaryWriter is deprecated."
" Pass a `Graph` object instead, such as `sess.graph`.")
# Check if the user passed it via the graph or the graph_def argument and
# correct for that.
if isinstance(graph, graph_pb2.GraphDef):
true_graph_def = graph
else:
true_graph_def = graph_def
else:
# The user passed neither `Graph`, nor `GraphDef`.
raise TypeError("The passed graph must be an instance of `Graph` "
"or the deprecated `GraphDef`")
# Finally, add the graph_def to the summary writer.
self._add_graph_def(true_graph_def, global_step)
def flush(self):
"""Flushes the event file to disk.

View File

@ -49,6 +49,25 @@ class SummaryWriterTestCase(tf.test.TestCase):
def _assertRecent(self, t):
self.assertTrue(abs(t - time.time()) < 5)
def _assertEventsWithGraph(self, test_dir, g, has_shapes):
rr = self._EventsReader(test_dir)
# The first event should list the file_version.
ev = next(rr)
self._assertRecent(ev.wall_time)
self.assertEquals("brain.Event:2", ev.file_version)
# The next event should have the graph.
ev = next(rr)
self._assertRecent(ev.wall_time)
self.assertEquals(0, ev.step)
ev_graph = tf.GraphDef()
ev_graph.ParseFromString(ev.graph_def)
self.assertProtoEquals(g.as_graph_def(add_shapes=has_shapes), ev_graph)
# We should be done.
self.assertRaises(StopIteration, lambda: next(rr))
def testAddingSummaryAndGraph(self):
test_dir = self._CleanTestDir("basics")
sw = tf.train.SummaryWriter(test_dir)
@ -105,30 +124,54 @@ class SummaryWriterTestCase(tf.test.TestCase):
# We should be done.
self.assertRaises(StopIteration, lambda: next(rr))
def testInitializingWithGraphDef(self):
test_dir = self._CleanTestDir("basics_with_graph")
def testGraphAsNamed(self):
test_dir = self._CleanTestDir("basics_named_graph")
with tf.Graph().as_default() as g:
tf.constant([12], name="douze")
sw = tf.train.SummaryWriter(test_dir, graph=g)
sw.close()
self._assertEventsWithGraph(test_dir, g, True)
def testGraphAsPositional(self):
test_dir = self._CleanTestDir("basics_positional_graph")
with tf.Graph().as_default() as g:
tf.constant([12], name="douze")
sw = tf.train.SummaryWriter(test_dir, g)
sw.close()
self._assertEventsWithGraph(test_dir, g, True)
def testGraphDefAsNamed(self):
test_dir = self._CleanTestDir("basics_named_graph_def")
with tf.Graph().as_default() as g:
tf.constant([12], name="douze")
gd = g.as_graph_def()
sw = tf.train.SummaryWriter(test_dir, graph_def=gd)
sw.close()
rr = self._EventsReader(test_dir)
self._assertEventsWithGraph(test_dir, g, False)
# The first event should list the file_version.
ev = next(rr)
self._assertRecent(ev.wall_time)
self.assertEquals("brain.Event:2", ev.file_version)
def testGraphDefAsPositional(self):
test_dir = self._CleanTestDir("basics_positional_graph_def")
with tf.Graph().as_default() as g:
tf.constant([12], name="douze")
gd = g.as_graph_def()
sw = tf.train.SummaryWriter(test_dir, gd)
sw.close()
self._assertEventsWithGraph(test_dir, g, False)
# The next event should have the graph.
ev = next(rr)
self._assertRecent(ev.wall_time)
self.assertEquals(0, ev.step)
ev_graph = tf.GraphDef()
ev_graph.ParseFromString(ev.graph_def)
self.assertProtoEquals(gd, ev_graph)
def testGraphAndGraphDef(self):
with self.assertRaises(ValueError):
test_dir = self._CleanTestDir("basics_graph_and_graph_def")
with tf.Graph().as_default() as g:
tf.constant([12], name="douze")
gd = g.as_graph_def()
sw = tf.train.SummaryWriter(test_dir, graph=g, graph_def=gd)
sw.close()
# We should be done.
self.assertRaises(StopIteration, lambda: next(rr))
def testNeitherGraphNorGraphDef(self):
with self.assertRaises(TypeError):
test_dir = self._CleanTestDir("basics_string_instead_of_graph")
sw = tf.train.SummaryWriter(test_dir, "string instead of graph object")
sw.close()
# Checks that values returned from session Run() calls are added correctly to
# summaries. These are numpy types so we need to check they fit in the

View File

@ -844,7 +844,7 @@ class SVSummaryThread(coordinator.LooperThread):
self._sess = sess
def run_loop(self):
if self._sv.global_step:
if self._sv.global_step is not None:
summary_strs, global_step = self._sess.run([self._sv.summary_op,
self._sv.global_step])
else:
@ -912,7 +912,7 @@ class SVTimerCheckpointThread(coordinator.LooperThread):
def run_loop(self):
self._sv.saver.save(self._sess, self._sv.save_path,
global_step=self._sv.global_step)
if self._sv.summary_writer and self._sv.global_step:
if self._sv.summary_writer and self._sv.global_step is not None:
current_step = training_util.global_step(self._sess, self._sv.global_step)
self._sv.summary_writer.add_session_log(
SessionLog(status=SessionLog.CHECKPOINT,

View File

@ -50,6 +50,7 @@ namespace perftools {
namespace gputools {
class Stream;
class ScratchAllocator;
template <typename ElemT>
class DeviceMemory;
@ -880,14 +881,14 @@ class BlasSupport {
const port::ArraySlice<DeviceMemory<float> *> &a, int lda,
const port::ArraySlice<DeviceMemory<float> *> &b, int ldb, float beta,
const port::ArraySlice<DeviceMemory<float> *> &c, int ldc,
int batch_count) = 0;
int batch_count, ScratchAllocator *scratch_allocator) = 0;
virtual bool DoBlasGemmBatched(
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
uint64 n, uint64 k, double alpha,
const port::ArraySlice<DeviceMemory<double> *> &a, int lda,
const port::ArraySlice<DeviceMemory<double> *> &b, int ldb, double beta,
const port::ArraySlice<DeviceMemory<double> *> &c, int ldc,
int batch_count) = 0;
int batch_count, ScratchAllocator *scratch_allocator) = 0;
virtual bool DoBlasGemmBatched(
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
uint64 n, uint64 k, std::complex<float> alpha,
@ -895,7 +896,7 @@ class BlasSupport {
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b, int ldb,
std::complex<float> beta,
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c, int ldc,
int batch_count) = 0;
int batch_count, ScratchAllocator *scratch_allocator) = 0;
virtual bool DoBlasGemmBatched(
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
uint64 n, uint64 k, std::complex<double> alpha,
@ -903,7 +904,7 @@ class BlasSupport {
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b, int ldb,
std::complex<double> beta,
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c, int ldc,
int batch_count) = 0;
int batch_count, ScratchAllocator *scratch_allocator) = 0;
// Computes a matrix-matrix product where one input matrix is Hermitian:
//
@ -1140,7 +1141,7 @@ class BlasSupport {
// Macro used to quickly declare overrides for abstract virtuals in the
// BlasSupport base class.
#define TENSORFLOW_STREAM_EXECUTOR_GPU_BLAS_SUPPORT_OVERRIDES \
#define TENSORFLOW_STREAM_EXECUTOR_GPU_BLAS_SUPPORT_OVERRIDES \
bool DoBlasAsum(Stream *stream, uint64 elem_count, \
const DeviceMemory<float> &x, int incx, \
DeviceMemory<float> *result) override; \
@ -1626,14 +1627,14 @@ class BlasSupport {
const port::ArraySlice<DeviceMemory<float> *> &a, int lda, \
const port::ArraySlice<DeviceMemory<float> *> &b, int ldb, float beta, \
const port::ArraySlice<DeviceMemory<float> *> &c, int ldc, \
int batch_count) override; \
int batch_count, ScratchAllocator *scratch_allocator) override; \
bool DoBlasGemmBatched( \
Stream *stream, blas::Transpose transa, blas::Transpose transb, \
uint64 m, uint64 n, uint64 k, double alpha, \
const port::ArraySlice<DeviceMemory<double> *> &a, int lda, \
const port::ArraySlice<DeviceMemory<double> *> &b, int ldb, double beta, \
const port::ArraySlice<DeviceMemory<double> *> &c, int ldc, \
int batch_count) override; \
int batch_count, ScratchAllocator *scratch_allocator) override; \
bool DoBlasGemmBatched( \
Stream *stream, blas::Transpose transa, blas::Transpose transb, \
uint64 m, uint64 n, uint64 k, std::complex<float> alpha, \
@ -1641,7 +1642,7 @@ class BlasSupport {
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b, int ldb, \
std::complex<float> beta, \
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c, int ldc, \
int batch_count) override; \
int batch_count, ScratchAllocator *scratch_allocator) override; \
bool DoBlasGemmBatched( \
Stream *stream, blas::Transpose transa, blas::Transpose transb, \
uint64 m, uint64 n, uint64 k, std::complex<double> alpha, \
@ -1650,7 +1651,7 @@ class BlasSupport {
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b, \
int ldb, std::complex<double> beta, \
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c, \
int ldc, int batch_count) override; \
int ldc, int batch_count, ScratchAllocator *scratch_allocator) override; \
bool DoBlasHemm(Stream *stream, blas::Side side, blas::UpperLower uplo, \
uint64 m, uint64 n, std::complex<float> alpha, \
const DeviceMemory<std::complex<float>> &a, int lda, \

View File

@ -19,6 +19,7 @@ limitations under the License.
#include <complex>
#include "third_party/gpus/cuda/include/cublas_v2.h"
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
#include "tensorflow/stream_executor/cuda/cuda_helpers.h"
@ -34,8 +35,8 @@ limitations under the License.
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/plugin_registry.h"
#include "tensorflow/stream_executor/scratch_allocator.h"
#include "tensorflow/stream_executor/stream_executor.h"
#include "third_party/gpus/cuda/include/cublas_v2.h"
namespace perftools {
namespace gputools {
@ -1707,37 +1708,64 @@ template <typename T, typename FuncT>
port::Status CUDABlas::DoBlasGemmBatchedInternal(
FuncT cublas_func, Stream *stream, blas::Transpose transa,
blas::Transpose transb, uint64 m, uint64 n, uint64 k, T alpha,
const port::ArraySlice<DeviceMemory<T> *> &a_array, int lda,
const port::ArraySlice<DeviceMemory<T> *> &b_array, int ldb, T beta,
const port::ArraySlice<DeviceMemory<T> *> &c_array, int ldc,
int batch_count) {
std::vector<T *> a_ptr_vec, b_ptr_vec, c_ptr_vec;
const port::ArraySlice<DeviceMemory<T> *> &a_ptrs_to_wrappers, int lda,
const port::ArraySlice<DeviceMemory<T> *> &b_ptrs_to_wrappers, int ldb,
T beta, const port::ArraySlice<DeviceMemory<T> *> &c_ptrs_to_wrappers,
int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
std::vector<T *> a_raw_ptrs, b_raw_ptrs, c_raw_ptrs;
for (int i = 0; i < batch_count; ++i) {
a_ptr_vec.push_back(static_cast<T *>(a_array[i]->opaque()));
b_ptr_vec.push_back(static_cast<T *>(b_array[i]->opaque()));
c_ptr_vec.push_back(static_cast<T *>(c_array[i]->opaque()));
a_raw_ptrs.push_back(static_cast<T *>(a_ptrs_to_wrappers[i]->opaque()));
b_raw_ptrs.push_back(static_cast<T *>(b_ptrs_to_wrappers[i]->opaque()));
c_raw_ptrs.push_back(static_cast<T *>(c_ptrs_to_wrappers[i]->opaque()));
}
typedef typename CUDAComplexT<T>::type CUDA_T;
SE_ASSIGN_OR_RETURN(
std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> a_ptr_array,
stream->AllocateTemporaryArray<CUDA_T *>(batch_count));
SE_ASSIGN_OR_RETURN(
std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> b_ptr_array,
stream->AllocateTemporaryArray<CUDA_T *>(batch_count));
SE_ASSIGN_OR_RETURN(
std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> c_ptr_array,
stream->AllocateTemporaryArray<CUDA_T *>(batch_count));
if (!stream->ThenMemcpy(a_ptr_array->mutable_device_memory(),
a_ptr_vec.data(), batch_count * sizeof(T *))
.ok() ||
!stream->ThenMemcpy(b_ptr_array->mutable_device_memory(),
b_ptr_vec.data(), batch_count * sizeof(T *))
.ok() ||
!stream->ThenMemcpy(c_ptr_array->mutable_device_memory(),
c_ptr_vec.data(), batch_count * sizeof(T *))
.ok()) {
const size_t size = batch_count * sizeof(CUDA_T *);
// Device-side copy of pointers to matrices.
DeviceMemory<CUDA_T *> a;
DeviceMemory<CUDA_T *> b;
DeviceMemory<CUDA_T *> c;
// If temporary space is allocated for device-side copies of pointers to
// matrices, that temporary space should not be freed until this function
// returns. Although the values for these unique_ptrs are not set here, they
// are declared at this scope so they will be destroyed when the function
// returns.
//
// If a scratch allocator is provided, these pointers will not be used at all.
std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> a_temporary;
std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> b_temporary;
std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> c_temporary;
// Decide how to allocate device-side copy of pointers to matrices based on
// whether a scratch allocator was passed.
if (scratch_allocator != nullptr) {
SE_ASSIGN_OR_RETURN(DeviceMemory<uint8> a_bytes,
scratch_allocator->AllocateBytes(stream, size));
SE_ASSIGN_OR_RETURN(DeviceMemory<uint8> b_bytes,
scratch_allocator->AllocateBytes(stream, size));
SE_ASSIGN_OR_RETURN(DeviceMemory<uint8> c_bytes,
scratch_allocator->AllocateBytes(stream, size));
a = DeviceMemory<CUDA_T *>(a_bytes);
b = DeviceMemory<CUDA_T *>(b_bytes);
c = DeviceMemory<CUDA_T *>(c_bytes);
} else {
SE_ASSIGN_OR_RETURN(a_temporary,
stream->AllocateTemporaryArray<CUDA_T *>(batch_count));
SE_ASSIGN_OR_RETURN(b_temporary,
stream->AllocateTemporaryArray<CUDA_T *>(batch_count));
SE_ASSIGN_OR_RETURN(c_temporary,
stream->AllocateTemporaryArray<CUDA_T *>(batch_count));
a = DeviceMemory<CUDA_T *>(*a_temporary->mutable_device_memory());
b = DeviceMemory<CUDA_T *>(*b_temporary->mutable_device_memory());
c = DeviceMemory<CUDA_T *>(*c_temporary->mutable_device_memory());
}
if (!stream->ThenMemcpy(&a, a_raw_ptrs.data(), size).ok() ||
!stream->ThenMemcpy(&b, b_raw_ptrs.data(), size).ok() ||
!stream->ThenMemcpy(&c, c_raw_ptrs.data(), size).ok()) {
return port::Status(port::error::INTERNAL,
"failed to copy memory from host to device in "
"CUDABlas::DoBlasGemmBatched");
@@ -1746,13 +1774,9 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
bool ok = DoBlasInternal(
cublas_func, stream, true /* = pointer_mode_host */,
CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
CUDAComplex(&alpha),
const_cast<const CUDA_T **>(CUDAMemory(a_ptr_array->device_memory())),
lda,
const_cast<const CUDA_T **>(CUDAMemory(b_ptr_array->device_memory())),
ldb, CUDAComplex(&beta),
const_cast<CUDA_T **>(CUDAMemory(c_ptr_array->device_memory())), ldc,
batch_count);
CUDAComplex(&alpha), const_cast<const CUDA_T **>(CUDAMemory(a)), lda,
const_cast<const CUDA_T **>(CUDAMemory(b)), ldb, CUDAComplex(&beta),
const_cast<CUDA_T **>(CUDAMemory(c)), ldc, batch_count);
if (ok) {
return port::Status::OK();
@@ -1767,10 +1791,11 @@ bool CUDABlas::DoBlasGemmBatched(
const port::ArraySlice<DeviceMemory<float> *> &a_array, int lda,
const port::ArraySlice<DeviceMemory<float> *> &b_array, int ldb, float beta,
const port::ArraySlice<DeviceMemory<float> *> &c_array, int ldc,
int batch_count) {
int batch_count, ScratchAllocator *scratch_allocator) {
SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal(
dynload::cublasSgemmBatched, stream, transa, transb, m, n, k, alpha,
a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count));
a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count,
scratch_allocator));
}
bool CUDABlas::DoBlasGemmBatched(
@@ -1779,10 +1804,11 @@ bool CUDABlas::DoBlasGemmBatched(
const port::ArraySlice<DeviceMemory<double> *> &a_array, int lda,
const port::ArraySlice<DeviceMemory<double> *> &b_array, int ldb,
double beta, const port::ArraySlice<DeviceMemory<double> *> &c_array,
int ldc, int batch_count) {
int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal(
dynload::cublasDgemmBatched, stream, transa, transb, m, n, k, alpha,
a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count));
a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count,
scratch_allocator));
}
bool CUDABlas::DoBlasGemmBatched(
@@ -1793,10 +1819,11 @@ bool CUDABlas::DoBlasGemmBatched(
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b_array,
int ldb, std::complex<float> beta,
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c_array,
int ldc, int batch_count) {
int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal(
dynload::cublasCgemmBatched, stream, transa, transb, m, n, k, alpha,
a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count));
a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count,
scratch_allocator));
}
bool CUDABlas::DoBlasGemmBatched(
@@ -1807,10 +1834,11 @@ bool CUDABlas::DoBlasGemmBatched(
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b_array,
int ldb, std::complex<double> beta,
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c_array,
int ldc, int batch_count) {
int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal(
dynload::cublasZgemmBatched, stream, transa, transb, m, n, k, alpha,
a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count));
a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count,
scratch_allocator));
}
bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side,
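
The heart of the cuda_blas.cc change is the allocation decision above: the device-side arrays of matrix pointers are carved out of the caller's ScratchAllocator when one is supplied, and otherwise fall back to stream-owned temporary memory that stays alive until DoBlasGemmBatchedInternal returns. A minimal, self-contained sketch of that fallback pattern follows, using a hypothetical Allocator interface and plain host memory in place of the stream_executor types; nothing in it is stream_executor API.

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <memory>
#include <vector>

// Hypothetical stand-in for a caller-provided scratch allocator.
class Allocator {
 public:
  virtual ~Allocator() = default;
  virtual void* AllocateBytes(size_t size) = 0;
};

// Stages an array of host pointers in a buffer that is either borrowed from
// the caller's allocator or owned locally, mirroring the scratch-allocator /
// temporary-memory split in DoBlasGemmBatchedInternal.
void* StagePointerArray(const std::vector<const float*>& host_ptrs,
                        Allocator* scratch_allocator,
                        std::unique_ptr<uint8_t[]>* owned_fallback) {
  const size_t size = host_ptrs.size() * sizeof(const float*);
  void* staging = nullptr;
  if (scratch_allocator != nullptr) {
    // Caller-managed memory: its lifetime is the caller's responsibility.
    staging = scratch_allocator->AllocateBytes(size);
  } else {
    // Locally owned memory: *owned_fallback keeps it alive for as long as the
    // caller holds it, just as the unique_ptr temporaries in the diff live
    // until DoBlasGemmBatchedInternal returns.
    owned_fallback->reset(new uint8_t[size]);
    staging = owned_fallback->get();
  }
  std::memcpy(staging, host_ptrs.data(), size);
  return staging;
}

In the real code the staging buffers live on the device and the copies go through stream->ThenMemcpy, but the ownership split is identical.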

View File

@@ -93,7 +93,7 @@ class CUDABlas : public blas::BlasSupport {
const port::ArraySlice<DeviceMemory<T> *> &a_array, int lda,
const port::ArraySlice<DeviceMemory<T> *> &b_array, int ldb, T beta,
const port::ArraySlice<DeviceMemory<T> *> &c_array, int ldc,
int batch_count);
int batch_count, ScratchAllocator *scratch_allocator);
// mutex that guards the cuBLAS handle for this device.
mutex mu_;

View File

@@ -2986,6 +2986,17 @@ Stream &Stream::ThenBlasGemmBatched(
int lda, const port::ArraySlice<DeviceMemory<float> *> &b, int ldb,
float beta, const port::ArraySlice<DeviceMemory<float> *> &c, int ldc,
int batch_count) {
return ThenBlasGemmBatchedWithScratch(transa, transb, m, n, k, alpha, a, lda,
b, ldb, beta, c, ldc, batch_count,
nullptr);
}
Stream &Stream::ThenBlasGemmBatchedWithScratch(
blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
uint64 k, float alpha, const port::ArraySlice<DeviceMemory<float> *> &a,
int lda, const port::ArraySlice<DeviceMemory<float> *> &b, int ldb,
float beta, const port::ArraySlice<DeviceMemory<float> *> &c, int ldc,
int batch_count, ScratchAllocator *scratch_allocator) {
VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb),
PARAM(beta), PARAM(c), PARAM(ldc), PARAM(batch_count));
@@ -2993,9 +3004,12 @@ Stream &Stream::ThenBlasGemmBatched(
ThenBlasImpl<blas::Transpose, blas::Transpose, uint64, uint64, uint64, float,
const port::ArraySlice<DeviceMemory<float> *> &, int,
const port::ArraySlice<DeviceMemory<float> *> &, int, float,
const port::ArraySlice<DeviceMemory<float> *> &, int, int> impl;
const port::ArraySlice<DeviceMemory<float> *> &, int, int,
ScratchAllocator *>
impl;
return impl(this, &blas::BlasSupport::DoBlasGemmBatched, transa, transb, m, n,
k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count);
k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count,
scratch_allocator);
}
Stream &Stream::ThenBlasGemmBatched(
@@ -3004,6 +3018,17 @@ Stream &Stream::ThenBlasGemmBatched(
int lda, const port::ArraySlice<DeviceMemory<double> *> &b, int ldb,
double beta, const port::ArraySlice<DeviceMemory<double> *> &c, int ldc,
int batch_count) {
return ThenBlasGemmBatchedWithScratch(transa, transb, m, n, k, alpha, a, lda,
b, ldb, beta, c, ldc, batch_count,
nullptr);
}
Stream &Stream::ThenBlasGemmBatchedWithScratch(
blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
uint64 k, double alpha, const port::ArraySlice<DeviceMemory<double> *> &a,
int lda, const port::ArraySlice<DeviceMemory<double> *> &b, int ldb,
double beta, const port::ArraySlice<DeviceMemory<double> *> &c, int ldc,
int batch_count, ScratchAllocator *scratch_allocator) {
VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb),
PARAM(beta), PARAM(c), PARAM(ldc), PARAM(batch_count));
@@ -3011,9 +3036,12 @@ Stream &Stream::ThenBlasGemmBatched(
ThenBlasImpl<blas::Transpose, blas::Transpose, uint64, uint64, uint64, double,
const port::ArraySlice<DeviceMemory<double> *> &, int,
const port::ArraySlice<DeviceMemory<double> *> &, int, double,
const port::ArraySlice<DeviceMemory<double> *> &, int, int> impl;
const port::ArraySlice<DeviceMemory<double> *> &, int, int,
ScratchAllocator *>
impl;
return impl(this, &blas::BlasSupport::DoBlasGemmBatched, transa, transb, m, n,
k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count);
k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count,
scratch_allocator);
}
Stream &Stream::ThenBlasGemmBatched(
@@ -3024,6 +3052,19 @@ Stream &Stream::ThenBlasGemmBatched(
std::complex<float> beta,
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c, int ldc,
int batch_count) {
return ThenBlasGemmBatchedWithScratch(transa, transb, m, n, k, alpha, a, lda,
b, ldb, beta, c, ldc, batch_count,
nullptr);
}
Stream &Stream::ThenBlasGemmBatchedWithScratch(
blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
uint64 k, std::complex<float> alpha,
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &a, int lda,
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b, int ldb,
std::complex<float> beta,
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c, int ldc,
int batch_count, ScratchAllocator *scratch_allocator) {
VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb),
PARAM(beta), PARAM(c), PARAM(ldc), PARAM(batch_count));
@@ -3035,9 +3076,11 @@ Stream &Stream::ThenBlasGemmBatched(
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &,
int, std::complex<float>,
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &,
int, int> impl;
int, int, ScratchAllocator *>
impl;
return impl(this, &blas::BlasSupport::DoBlasGemmBatched, transa, transb, m, n,
k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count);
k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count,
scratch_allocator);
}
Stream &Stream::ThenBlasGemmBatched(
@@ -3048,6 +3091,19 @@ Stream &Stream::ThenBlasGemmBatched(
std::complex<double> beta,
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c, int ldc,
int batch_count) {
return ThenBlasGemmBatchedWithScratch(transa, transb, m, n, k, alpha, a, lda,
b, ldb, beta, c, ldc, batch_count,
nullptr);
}
Stream &Stream::ThenBlasGemmBatchedWithScratch(
blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
uint64 k, std::complex<double> alpha,
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &a, int lda,
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b, int ldb,
std::complex<double> beta,
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c, int ldc,
int batch_count, ScratchAllocator *scratch_allocator) {
VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb),
PARAM(beta), PARAM(c), PARAM(ldc), PARAM(batch_count));
@@ -3059,9 +3115,11 @@ Stream &Stream::ThenBlasGemmBatched(
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &,
int, std::complex<double>,
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &,
int, int> impl;
int, int, ScratchAllocator *>
impl;
return impl(this, &blas::BlasSupport::DoBlasGemmBatched, transa, transb, m, n,
k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count);
k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count,
scratch_allocator);
}
Stream &Stream::ThenSetRngSeed(const uint8 *seed, uint64 seed_bytes) {
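
Each pre-existing ThenBlasGemmBatched overload above now simply forwards to its ThenBlasGemmBatchedWithScratch counterpart with a null allocator, so existing call sites keep their old behaviour without being touched. A generic sketch of that backward-compatible forwarding pattern; the names here are hypothetical and are not the Stream API.

struct ScratchPool;  // Hypothetical allocator type, forward-declared only.

class Runner {
 public:
  // New entry point: callers that want to control temporary memory use this.
  // A null pool means "allocate temporaries internally", as before.
  int LaunchWithScratch(int work_items, ScratchPool* /*scratch*/) {
    return work_items;  // Placeholder for the real work.
  }

  // Old entry point kept as a thin wrapper so existing callers are untouched.
  int Launch(int work_items) {
    return LaunchWithScratch(work_items, /*scratch=*/nullptr);
  }
};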

View File

@@ -944,6 +944,34 @@ class Stream {
std::complex<double> beta,
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c, int ldc,
int batch_count);
Stream &ThenBlasGemmBatchedWithScratch(
blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
uint64 k, float alpha, const port::ArraySlice<DeviceMemory<float> *> &a,
int lda, const port::ArraySlice<DeviceMemory<float> *> &b, int ldb,
float beta, const port::ArraySlice<DeviceMemory<float> *> &c, int ldc,
int batch_count, ScratchAllocator *scratch_allocator);
Stream &ThenBlasGemmBatchedWithScratch(
blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
uint64 k, double alpha, const port::ArraySlice<DeviceMemory<double> *> &a,
int lda, const port::ArraySlice<DeviceMemory<double> *> &b, int ldb,
double beta, const port::ArraySlice<DeviceMemory<double> *> &c, int ldc,
int batch_count, ScratchAllocator *scratch_allocator);
Stream &ThenBlasGemmBatchedWithScratch(
blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
uint64 k, std::complex<float> alpha,
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &a, int lda,
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b, int ldb,
std::complex<float> beta,
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c, int ldc,
int batch_count, ScratchAllocator *scratch_allocator);
Stream &ThenBlasGemmBatchedWithScratch(
blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
uint64 k, std::complex<double> alpha,
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &a, int lda,
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b, int ldb,
std::complex<double> beta,
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c, int ldc,
int batch_count, ScratchAllocator *scratch_allocator);
// See BlasSupport::DoBlasHemm.
Stream &ThenBlasHemm(blas::Side side, blas::UpperLower uplo, uint64 m,
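
Given these declarations, a caller that wants the BLAS layer to draw its device-side pointer arrays from its own memory pool goes through the new ThenBlasGemmBatchedWithScratch entry points; passing a null allocator, or calling the original overloads, keeps the old temporary-memory behaviour. A hedged usage sketch for the float overload follows: the helper name is invented, and the kNoTranspose constants are assumed from the surrounding stream_executor blas.h rather than shown in this diff.

#include "tensorflow/stream_executor/scratch_allocator.h"
#include "tensorflow/stream_executor/stream.h"

namespace se = perftools::gputools;

// Enqueues C_i = A_i * B_i for a batch of square n x n matrices, letting the
// BLAS implementation allocate its device-side pointer arrays from `scratch`.
// A null `scratch` falls back to stream-owned temporary memory.
void BatchedMatmul(se::Stream* stream, int n,
                   const se::port::ArraySlice<se::DeviceMemory<float>*>& a,
                   const se::port::ArraySlice<se::DeviceMemory<float>*>& b,
                   const se::port::ArraySlice<se::DeviceMemory<float>*>& c,
                   se::ScratchAllocator* scratch) {
  stream->ThenBlasGemmBatchedWithScratch(
      se::blas::Transpose::kNoTranspose, se::blas::Transpose::kNoTranspose,
      n, n, n, /*alpha=*/1.0f, a, /*lda=*/n, b, /*ldb=*/n, /*beta=*/0.0f, c,
      /*ldc=*/n, /*batch_count=*/static_cast<int>(a.size()), scratch);
}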

View File

@@ -115,7 +115,7 @@ The #center div contains tf-charts embedded inside tf-collapsable-panes.
<p>
Maybe data hasn't loaded yet, or maybe you need
to add some <code>tf.scalar_summary</code> ops to your graph, and
serialize them using the <code>tf.training.summary_io.SummaryWriter</code>.
serialize them using the <code>tf.train.SummaryWriter</code>.
</p>
</div>
</template>

View File

@@ -75,7 +75,6 @@ Properties out:
display: flex;
flex-grow: 1;
flex-shrink: 1;
height: 0px; /* hackhack So the flex-grow takes over and gives it space */
}
.x-button {
font-size: 13px;

View File

@@ -515,6 +515,13 @@ function addEdges(h: Hierarchy, graph: SlimGraph,
let sourceAncestorIndex = getPath(graph.nodes[baseEdge.v], sourcePath);
let destAncestorIndex = getPath(graph.nodes[baseEdge.w], destPath);
// If the hierarchical path cannot be found for either endpoint, then we
    // cannot create the edge. This happens, for example, when a node has a
    // control dependency on a summary node, since summary nodes are embedded.
if (sourceAncestorIndex === -1 || destAncestorIndex === -1) {
return;
}
// Find the lowest shared ancestor between source and dest by looking for
// the highest nodes that differ between their ancestor paths.
while (sourcePath[sourceAncestorIndex] === destPath[destAncestorIndex]) {

View File

@@ -87,7 +87,7 @@ export const PARAMS = {
*/
labelHeight: 20,
/** X-space between each extracted node and the core graph. */
extractXOffset: 50,
extractXOffset: 15,
/** Y-space between each extracted node. */
extractYOffset: 20
},
@@ -486,9 +486,24 @@ function layoutMetanode(renderNodeInfo: render.RenderGroupNodeInfo): void {
return height + yOffset + child.height;
}, 0);
// Compute the total padding between the core graph, in-extract and
// out-extract boxes.
let numParts = 0;
if (renderNodeInfo.isolatedInExtract.length > 0) {
numParts++;
}
if (renderNodeInfo.isolatedOutExtract.length > 0) {
numParts++;
}
if (renderNodeInfo.coreGraph.nodeCount() > 0) {
numParts++;
}
let offset = PARAMS.subscene.meta.extractXOffset;
let padding = numParts <= 1 ? 0 : (numParts <= 2 ? offset : 2 * offset);
// Add the in-extract and out-extract width to the core box width.
renderNodeInfo.coreBox.width += renderNodeInfo.inExtractBox.width +
renderNodeInfo.outExtractBox.width;
renderNodeInfo.outExtractBox.width + padding;
renderNodeInfo.coreBox.height =
params.labelHeight +
Math.max(
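
Concretely, with extractXOffset reduced to 15: a metanode whose core graph, in-extract, and out-extract are all non-empty gets numParts = 3 and padding = 2 * 15 = 30 added to coreBox.width; with exactly two of the three present the padding is 15; and with at most one part it is 0, so isolated extracts no longer reserve horizontal space they do not use.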

View File

@@ -964,8 +964,6 @@ export class RenderNodeInfo {
/** Label vertical offset from the center of node shape */
labelOffset: number;
/** X-space between each extracted node and the core graph. */
extractXOffset: number;
/** Rectangle radius (for making rounded rectangle) */
radius: number;
@@ -1027,7 +1025,6 @@ export class RenderNodeInfo {
// Params for node box.
this.labelOffset = 0;
this.extractXOffset = 0;
this.radius = 0;
// Params for expanded node

Some files were not shown because too many files have changed in this diff.