Merge changes from github.

PiperOrigin-RevId: 192850372
2018-04-13 17:52:20 -07:00 · 2018-04-13 17:52:20 -07:00 · 3652556dab
commit 3652556dab
parent ef24ad1450
73 changed files with 1795 additions and 400 deletions
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@ -450,11 +450,12 @@ tf_cc_shared_object(
    linkstatic = 1,
    visibility = ["//visibility:public"],
    deps = [
        "//tensorflow/core:framework_internal_impl",
        "//tensorflow/core:lib_internal_impl",
        "//tensorflow/core:core_cpu_impl",
-        "//tensorflow/stream_executor:stream_executor_impl",
+        "//tensorflow/core:framework_internal_impl",
        "//tensorflow/core:gpu_runtime_impl",
        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry_impl",
        "//tensorflow/core:lib_internal_impl",
        "//tensorflow/stream_executor:stream_executor_impl",
    ] + tf_additional_binary_deps(),
 )
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@ -318,6 +318,7 @@ cc_library(
        "//tensorflow/core:lib",
        "//tensorflow/core:lib_internal",
        "//tensorflow/core:protos_all_cc",
        "//tensorflow/core/kernels:bounds_check",
    ],
 )
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/control_flow.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/public/version.h"
@ -441,6 +442,9 @@ string DescribeCycle(const GraphCycles& cycles, const Graph& graph, int src,
  }
  auto node_name = [&cycles, &graph](int node_id) {
    if (!FastBoundsCheck(node_id, graph.num_node_ids())) {
      return string("(null)");
    }
    auto* node = graph.FindNodeId(node_id);
    if (node == nullptr) {
      return string("(null)");
--- a/tensorflow/contrib/cmake/external/grpc.cmake
+++ b/tensorflow/contrib/cmake/external/grpc.cmake
@ -35,6 +35,7 @@ else()
  set(grpc_STATIC_LIBRARIES
      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc++_unsecure.a
      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc_unsecure.a
      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libaddress_sorting.a
      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/third_party/cares/cares/lib/libcares.a
      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgpr.a)
 endif()
--- a/tensorflow/contrib/copy_graph/python/util/copy_elements.py
+++ b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
@ -201,7 +201,7 @@ def copy_op_to_graph(org_instance, to_graph, variables, scope=''):
    #An instance of tensorflow.core.framework.node_def_pb2.NodeDef, it
    #stores String-based info such as name, device and type of the op.
    #Unique to every Operation instance.
-    new_node_def = deepcopy(op._node_def)
+    new_node_def = deepcopy(op.node_def)
    #Change the name
    new_node_def.name = new_name
@ -211,7 +211,7 @@ def copy_op_to_graph(org_instance, to_graph, variables, scope=''):
    #Make a copy of the op_def too.
    #Its unique to every _type_ of Operation.
-    op_def = deepcopy(op._op_def)
+    op_def = deepcopy(op.op_def)
    #Initialize a new Operation instance
    new_op = ops.Operation(new_node_def, to_graph, new_inputs, output_types,
--- a/tensorflow/contrib/data/init.py
+++ b/tensorflow/contrib/data/init.py
@ -25,6 +25,7 @@ See the @{$datasets$Importing Data} Programmer's Guide for an overview.
@@Counter
@@SqlDataset
@@assert_element_shape
@@batch_and_drop_remainder
@@bucket_by_sequence_length
@@dense_to_sparse_batch
@ -55,6 +56,7 @@ from __future__ import print_function
 # pylint: disable=unused-import
 from tensorflow.contrib.data.python.ops.batching import assert_element_shape
 from tensorflow.contrib.data.python.ops.batching import batch_and_drop_remainder
 from tensorflow.contrib.data.python.ops.batching import dense_to_sparse_batch
 from tensorflow.contrib.data.python.ops.batching import map_and_batch
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@ -21,6 +21,7 @@ py_test(
        "//tensorflow/python:dtypes",
        "//tensorflow/python:errors",
        "//tensorflow/python:math_ops",
        "//tensorflow/python:script_ops",
        "//tensorflow/python:sparse_tensor",
        "//tensorflow/python:string_ops",
        "//tensorflow/python:tensor_shape",
--- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
@ -28,8 +28,10 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
@ -579,5 +581,73 @@ class PaddedBatchDatasetSerializationTest(
                        lambda: build_dataset(seq_lens2), 8)
 class RestructuredDatasetTest(test.TestCase):
  def test_assert_element_shape(self):
    def create_unknown_shape_dataset(x):
      return script_ops.py_func(lambda _: (np.ones(2, dtype=np.float32),
                                           np.zeros((3, 4), dtype=np.int32)),
                                [x],
                                [dtypes.float32, dtypes.int32])
    dataset = dataset_ops.Dataset.range(5).map(create_unknown_shape_dataset)
    unknown_shapes = (tensor_shape.TensorShape(None),
                      tensor_shape.TensorShape(None))
    self.assertEqual(unknown_shapes, dataset.output_shapes)
    expected_shapes = (tensor_shape.TensorShape(2),
                       tensor_shape.TensorShape((3, 4)))
    result = dataset.apply(batching.assert_element_shape(expected_shapes))
    self.assertEqual(expected_shapes, result.output_shapes)
    iterator = result.make_initializable_iterator()
    init_op = iterator.initializer
    get_next = iterator.get_next()
    with self.test_session() as sess:
      sess.run(init_op)
      for _ in range(5):
        sess.run(get_next)
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(get_next)
  def test_assert_wrong_element_shape(self):
    def create_dataset(_):
      return (array_ops.ones(2, dtype=dtypes.float32),
              array_ops.zeros((3, 4), dtype=dtypes.int32))
    dataset = dataset_ops.Dataset.range(3).map(create_dataset)
    wrong_shapes = (tensor_shape.TensorShape(2),
                    tensor_shape.TensorShape((3, 10)))
    with self.assertRaises(ValueError):
      dataset.apply(batching.assert_element_shape(wrong_shapes))
  def test_assert_wrong_element_shape_on_unknown_shape_dataset(self):
    def create_unknown_shape_dataset(x):
      return script_ops.py_func(lambda _: (np.ones(2, dtype=np.float32),
                                           np.zeros((3, 4), dtype=np.int32)),
                                [x],
                                [dtypes.float32, dtypes.int32])
    dataset = dataset_ops.Dataset.range(3).map(create_unknown_shape_dataset)
    unknown_shapes = (tensor_shape.TensorShape(None),
                      tensor_shape.TensorShape(None))
    self.assertEqual(unknown_shapes, dataset.output_shapes)
    wrong_shapes = (tensor_shape.TensorShape(2),
                    tensor_shape.TensorShape((3, 10)))
    iterator = (
        dataset.apply(batching.assert_element_shape(wrong_shapes))
        .make_initializable_iterator())
    init_op = iterator.initializer
    get_next = iterator.get_next()
    with self.test_session() as sess:
      sess.run(init_op)
      with self.assertRaises(errors.InvalidArgumentError):
        sess.run(get_next)
 if __name__ == "__main__":
  test.main()
--- a/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
@ -47,6 +47,11 @@ class SequenceDatasetSerializationTest(
    # Skip nothing
    self.run_core_tests(lambda: self._build_skip_dataset(0), None, 10)
  def testInvalidSkip(self):
    with self.assertRaisesRegexp(ValueError,
                                 'Shape must be rank 0 but is rank 1'):
      self.run_core_tests(lambda: self._build_skip_dataset([1, 2]), None, 0)
  def _build_take_dataset(self, count):
    components = (np.arange(10),)
    return dataset_ops.Dataset.from_tensor_slices(components).take(count)
@ -69,6 +74,11 @@ class SequenceDatasetSerializationTest(
    # Take nothing
    self.run_core_tests(lambda: self._build_take_dataset(0), None, 0)
  def testInvalidTake(self):
    with self.assertRaisesRegexp(ValueError,
                                 'Shape must be rank 0 but is rank 1'):
      self.run_core_tests(lambda: self._build_take_dataset([1, 2]), None, 0)
  def _build_repeat_dataset(self, count, take_count=3):
    components = (np.arange(10),)
    return dataset_ops.Dataset.from_tensor_slices(components).take(
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@ -112,6 +112,7 @@ py_library(
    srcs = ["batching.py"],
    srcs_version = "PY2AND3",
    deps = [
        "//tensorflow/contrib/framework:framework_py",
        "//tensorflow/python:array_ops",
        "//tensorflow/python:dataset_ops_gen",
        "//tensorflow/python:dtypes",
--- a/tensorflow/contrib/data/python/ops/batching.py
+++ b/tensorflow/contrib/data/python/ops/batching.py
@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 from tensorflow.contrib.framework import with_shape
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
@ -345,6 +346,46 @@ class _RestructuredDataset(dataset_ops.Dataset):
    return self._output_shapes
 def assert_element_shape(expected_shapes):
  """Assert the shape of this `Dataset`.
  ```python
  shapes = [tf.TensorShape([16, 256]), tf.TensorShape(None)]
  result = dataset.apply(tf.contrib.data.assert_element_shape(shapes))
  print(result.output_shapes)  # ==> "((16, 256), <unknown>)"
  ```
  If dataset shapes and expected_shape, are fully defined, assert they match.
  Otherwise, add assert op that will validate the shapes when tensors are
  evaluated, and set shapes on tensors, respectively.
  Args:
    expected_shapes: A nested structure of `tf.TensorShape` objects.
  Returns:
    A `Dataset` transformation function, which can be passed to
    @{tf.data.Dataset.apply}
  """
  def _check_shape(*elements):
    flatten_tensors = nest.flatten(elements)
    flatten_shapes = nest.flatten(expected_shapes)
    checked_tensors = [
        with_shape(shape, tensor)
        for shape, tensor in zip(flatten_shapes, flatten_tensors)
    ]
    return nest.pack_sequence_as(elements, checked_tensors)
  def _apply_fn(dataset):
    return _RestructuredDataset(
        dataset.map(_check_shape),
        dataset.output_types,
        output_shapes=expected_shapes,
        output_classes=dataset.output_classes)
  return _apply_fn
 class _MapAndBatchDataset(dataset_ops.MapDataset):
  """A `Dataset` that maps a function over a batch of elements."""
--- a/tensorflow/contrib/distribute/python/values.py
+++ b/tensorflow/contrib/distribute/python/values.py
@ -73,7 +73,7 @@ class DistributedValues(object):
  @property
  def devices(self):
-    return self._index.keys()
+    return list(self._index.keys())
  def __str__(self):
    return "%s:%s" % (self.__class__.__name__, self._index)
--- a/tensorflow/contrib/kernel_methods/python/losses.py
+++ b/tensorflow/contrib/kernel_methods/python/losses.py
@ -43,10 +43,10 @@ def sparse_multiclass_hinge_loss(
  This is a generalization of standard (binary) hinge loss. For a given instance
  with correct label c*, the loss is given by:
-    loss = max_{c != c*} logits_c - logits_{c*} + 1.
+    $$loss = max_{c != c*} logits_c - logits_{c*} + 1.$$
  or equivalently
-    loss = max_c { logits_c - logits_{c*} + I_{c != c*} }
+    $$loss = max_c { logits_c - logits_{c*} + I_{c != c*} }$$
-  where I_{c != c*} = 1 if c != c* and 0 otherwise.
+  where \\(I_{c != c*} = 1\ \text{if}\ c != c*\\) and 0 otherwise.
  Args:
    labels: `Tensor` of shape [batch_size] or [batch_size, 1]. Corresponds to
--- a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
+++ b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
@ -34,33 +34,31 @@ class RandomFourierFeatureMapper(dkm.DenseKernelMapper):
  r"""Class that implements Random Fourier Feature Mapping (RFFM) in TensorFlow.
  The RFFM mapping is used to approximate the Gaussian (RBF) kernel:
-  ```
+  $$(exp(-||x-y||_2^2 / (2 * \sigma^2))$$
  exp(-||x-y||_2^2 / (2 * sigma^2))
  ```
  The implementation of RFFM is based on the following paper:
  "Random Features for Large-Scale Kernel Machines" by Ali Rahimi and Ben Recht.
  (link: https://people.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf)
-  The mapping uses a matrix `Omega \in R^{d x D}` and a bias vector `b \in R^D`
+  The mapping uses a matrix \\(\Omega \in R^{d x D}\\) and a bias vector
-  where `d` is the input dimension (number of dense input features) and `D` is
+  \\(b \in R^D\\) where \\(d\\) is the input dimension (number of dense input
-  the output dimension (i.e., dimension of the feature space the input is mapped
+  features) and \\(D\\) is the output dimension (i.e., dimension of the feature
-  to). Each entry of `Omega` is sampled i.i.d. from a (scaled) Gaussian
+  space the input is mapped to). Each entry of \\(\Omega\\) is sampled i.i.d.
-  distribution and each entry of `b` is sampled independently and uniformly from
+  from a (scaled) Gaussian distribution and each entry of \\(b\\) is sampled
-  [0, 2 * pi].
+  independently and uniformly from [0, \\(2 * \pi\\)].
-  For a single input feature vector x in R^d, its RFFM is defined as:
+  For a single input feature vector \\(x \in R^d\\), its RFFM is defined as:
-  ```
+  $$\sqrt(2/D) * cos(x * \Omega + b)$$
-      sqrt(2/D) * cos(x * Omega + b)
+
-  ```
+  where \\(cos\\) is the element-wise cosine function and \\(x, b\\) are
-  where `cos` is the element-wise cosine function and `x, b` are represented as
+  represented as row vectors. The aforementioned paper shows that the linear
-  row vectors. The aforementioned paper shows that the linear kernel of
+  kernel of RFFM-mapped vectors approximates the Gaussian kernel of the initial
-  RFFM-mapped vectors approximates the Gaussian kernel of the initial vectors.
+  vectors.
  """
  def __init__(self, input_dim, output_dim, stddev=1.0, seed=1, name=None):
-    """Constructs a RandomFourierFeatureMapper instance.
+    r"""Constructs a RandomFourierFeatureMapper instance.
    Args:
      input_dim: The dimension (number of features) of the tensors to be mapped.
@ -68,11 +66,11 @@ class RandomFourierFeatureMapper(dkm.DenseKernelMapper):
      stddev: The standard deviation of the Gaussian kernel to be approximated.
        The error of the classifier trained using this approximation is very
        sensitive to this parameter.
-      seed: An integer used to initialize the parameters (`Omega` and `b`) of
+      seed: An integer used to initialize the parameters (\\(\Omega\\) and
-        the mapper. For repeatable sequences across different invocations of the
+        \\(b\\)) of the mapper. For repeatable sequences across different
-        mapper object (for instance, to ensure consistent mapping both at
+        invocations of the mapper object (for instance, to ensure consistent
-        training and eval/inference if these happen in different invocations),
+        mapping both at training and eval/inference if these happen in
-        set this to the same integer.
+        different invocations), set this to the same integer.
      name: name for the mapper object.
    """
    # TODO(sibyl-vie3Poto): Maybe infer input_dim and/or output_dim (if not explicitly
--- a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py
+++ b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py
@ -34,7 +34,7 @@ def _inner_product(x, y):
  """Inner product between tensors x and y.
  The input tensors are assumed to be in ROW representation, that is, the method
-  returns x * y^T.
+  returns \\(x * y^T\\).
  Args:
    x: input tensor in row format
--- a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
+++ b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
@ -19,11 +19,11 @@ Information matrix. Suppose one has a model that parameterizes a posterior
 distribution over 'y' given 'x' with parameters 'params', p(y | x, params). Its
 Fisher Information matrix is given by,
-  F(params) = E[ v(x, y, params) v(x, y, params)^T ]
+  $$F(params) = E[ v(x, y, params) v(x, y, params)^T ]$$
 where,
-  v(x, y, params) = (d / d params) log p(y | x, params)
+  $$v(x, y, params) = (d / d params) log p(y | x, params)$$
 and the expectation is taken with respect to the data's distribution for 'x' and
 the model's posterior distribution for 'y',
@ -85,7 +85,7 @@ def normalize_damping(damping, num_replications):
 def compute_pi_tracenorm(left_cov, right_cov):
  """Computes the scalar constant pi for Tikhonov regularization/damping.
-  pi = sqrt( (trace(A) / dim(A)) / (trace(B) / dim(B)) )
+  $$\pi = \sqrt{ (trace(A) / dim(A)) / (trace(B) / dim(B)) }$$
  See section 6.3 of https://arxiv.org/pdf/1503.05671.pdf for details.
  Args:
@ -462,14 +462,14 @@ class FullyConnectedDiagonalFB(InputOutputMultiTower, FisherBlock):
  Let 'params' be a vector parameterizing a model and 'i' an arbitrary index
  into it. We are interested in Fisher(params)[i, i]. This is,
-    Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i]
+    $$Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i]
-                         = E[ v(x, y, params)[i] ^ 2 ]
+                         = E[ v(x, y, params)[i] ^ 2 ]$$
  Consider fully connected layer in this model with (unshared) weight matrix
  'w'. For an example 'x' that produces layer inputs 'a' and output
  preactivations 's',
-    v(x, y, w) = vec( a (d loss / d s)^T )
+    $$v(x, y, w) = vec( a (d loss / d s)^T )$$
  This FisherBlock tracks Fisher(params)[i, i] for all indices 'i' corresponding
  to the layer's parameters 'w'.
@ -532,14 +532,14 @@ class ConvDiagonalFB(InputOutputMultiTower, FisherBlock):
  Let 'params' be a vector parameterizing a model and 'i' an arbitrary index
  into it. We are interested in Fisher(params)[i, i]. This is,
-    Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i]
+    $$Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i]
-                         = E[ v(x, y, params)[i] ^ 2 ]
+                         = E[ v(x, y, params)[i] ^ 2 ]$$
  Consider a convoluational layer in this model with (unshared) filter matrix
  'w'. For an example image 'x' that produces layer inputs 'a' and output
  preactivations 's',
-    v(x, y, w) = vec( sum_{loc} a_{loc} (d loss / d s_{loc})^T )
+    $$v(x, y, w) = vec( sum_{loc} a_{loc} (d loss / d s_{loc})^T )$$
  where 'loc' is a single (x, y) location in an image.
@ -805,12 +805,12 @@ class ConvKFCBasicFB(InputOutputMultiTower, KroneckerProductFB):
  'w'. For a minibatch that produces inputs 'a' and output preactivations 's',
  this FisherBlock estimates,
-    F(w) = #locations * kronecker(E[flat(a) flat(a)^T],
+    $$F(w) = \#locations * kronecker(E[flat(a) flat(a)^T],
-                                  E[flat(ds) flat(ds)^T])
+                                  E[flat(ds) flat(ds)^T])$$
  where
-    ds = (d / ds) log p(y | x, w)
+    $$ds = (d / ds) log p(y | x, w)$$
    #locations = number of (x, y) locations where 'w' is applied.
  where the expectation is taken over all examples and locations and flat()
@ -1567,7 +1567,7 @@ class FullyConnectedSeriesFB(InputOutputMultiTowerMultiUse,
    if self._option == SeriesFBApproximation.option1:
-      # Note that L_A = A0^(-1/2) * U_A and L_G = G0^(-1/2) * U_G.
+      # Note that \\(L_A = A0^{-1/2} * U_A and L_G = G0^{-1/2} * U_G.\\)
      L_A, psi_A = self._input_factor.get_option1quants(
          self._input_damping_func)
      L_G, psi_G = self._output_factor.get_option1quants(
@ -1581,33 +1581,33 @@ class FullyConnectedSeriesFB(InputOutputMultiTowerMultiUse,
        T = self._num_timesteps
        return (1 - x)**2 / (T * (1 - x**2) - 2 * x * (1 - x**T))
-      # Y = gamma( psi_G*psi_A^T ) (computed element-wise)
+      # \\(Y = \gamma( psi_G*psi_A^T )\\) (computed element-wise)
      # Even though Y is Z-independent we are recomputing it from the psi's
      # each since Y depends on both A and G quantities, and it is relatively
      # cheap to compute.
      Y = gamma(array_ops.reshape(psi_G, [int(psi_G.shape[0]), -1]) * psi_A)
-      # Z = L_G^T * Z * L_A
+      # \\(Z = L_G^T * Z * L_A\\)
      # This is equivalent to the following computation from the original
      # pseudo-code:
-      # Z = G0^(-1/2) * Z * A0^(-1/2)
+      # \\(Z = G0^{-1/2} * Z * A0^{-1/2}\\)
-      # Z = U_G^T * Z * U_A
+      # \\(Z = U_G^T * Z * U_A\\)
      Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A), transpose_a=True)
-      # Z = Z .* Y
+      # \\(Z = Z .* Y\\)
      Z *= Y
-      # Z = L_G * Z * L_A^T
+      # \\(Z = L_G * Z * L_A^T\\)
      # This is equivalent to the following computation from the original
      # pseudo-code:
-      # Z = U_G * Z * U_A^T
+      # \\(Z = U_G * Z * U_A^T\\)
-      # Z = G0^(-1/2) * Z * A0^(-1/2)
+      # \\(Z = G0^{-1/2} * Z * A0^{-1/2}\\)
      Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A, transpose_b=True))
    elif self._option == SeriesFBApproximation.option2:
-      # Note that P_A = A_1^T * A_0^(-1) and P_G = G_1^T * G_0^(-1),
+      # Note that \\(P_A = A_1^T * A_0^{-1} and P_G = G_1^T * G_0^{-1}\\),
-      # and K_A = A_0^(-1/2) * E_A and K_G = G_0^(-1/2) * E_G.
+      # and \\(K_A = A_0^{-1/2} * E_A\ and\ K_G = G_0^{-1/2} * E_G.\\)
      P_A, K_A, mu_A = self._input_factor.get_option2quants(
          self._input_damping_func)
      P_G, K_G, mu_G = self._output_factor.get_option2quants(
@ -1616,26 +1616,26 @@ class FullyConnectedSeriesFB(InputOutputMultiTowerMultiUse,
      # Our approach differs superficially from the pseudo-code in the paper
      # in order to reduce the total number of matrix-matrix multiplies.
      # In particular, the first three computations in the pseudo code are
-      # Z = G0^(-1/2) * Z * A0^(-1/2)
+      # \\(Z = G0^{-1/2} * Z * A0^{-1/2}\\)
-      # Z = Z - hPsi_G^T * Z * hPsi_A
+      # \\(Z = Z - hPsi_G^T * Z * hPsi_A\\)
-      # Z = E_G^T * Z * E_A
+      # \\(Z = E_G^T * Z * E_A\\)
-      # Noting that hPsi = C0^(-1/2) * C1 * C0^(-1/2), so that
+      # Noting that hPsi = C0^{-1/2} * C1 * C0^{-1/2}\\), so that
-      # C0^(-1/2) * hPsi = C0^(-1) * C1 * C0^(-1/2) = P^T * C0^(-1/2)
+      # \\(C0^{-1/2} * hPsi = C0^{-1} * C1 * C0^{-1/2} = P^T * C0^{-1/2}\\)
      # the entire computation can be written as
-      # Z = E_G^T * (G0^(-1/2) * Z * A0^(-1/2)
+      # \\(Z = E_G^T * (G0^{-1/2} * Z * A0^{-1/2}\\)
-      #     - hPsi_G^T * G0^(-1/2) * Z * A0^(-1/2) * hPsi_A) * E_A
+      # \\(    - hPsi_G^T * G0^{-1/2} * Z * A0^{-1/2} * hPsi_A) * E_A\\)
-      #   = E_G^T * (G0^(-1/2) * Z * A0^(-1/2)
+      # \\(  = E_G^T * (G0^{-1/2} * Z * A0^{-1/2}\\)
-      #     - G0^(-1/2) * P_G * Z * P_A^T * A0^(-1/2)) * E_A
+      # \\(    - G0^{-1/2} * P_G * Z * P_A^T * A0^{-1/2}) * E_A\\)
-      #   = E_G^T * G0^(-1/2) * Z * A0^(-1/2) * E_A
+      # \\(  = E_G^T * G0^{-1/2} * Z * A0^{-1/2} * E_A\\)
-      #     -  E_G^T* G0^(-1/2) * P_G * Z * P_A^T * A0^(-1/2) * E_A
+      # \\(    -  E_G^T* G0^{-1/2} * P_G * Z * P_A^T * A0^{-1/2} * E_A\\)
-      #   = K_G^T * Z * K_A  -  K_G^T * P_G * Z * P_A^T * K_A
+      # \\(  = K_G^T * Z * K_A  -  K_G^T * P_G * Z * P_A^T * K_A\\)
      # This final expression is computed by the following two lines:
-      # Z = Z - P_G * Z * P_A^T
+      # \\(Z = Z - P_G * Z * P_A^T\\)
      Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A, transpose_b=True))
-      # Z = K_G^T * Z * K_A
+      # \\(Z = K_G^T * Z * K_A\\)
      Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A), transpose_a=True)
-      # Z = Z ./ (1*1^T - mu_G*mu_A^T)
+      # \\(Z = Z ./ (1*1^T - mu_G*mu_A^T)\\)
      # Be careful with the outer product.  We don't want to accidentally
      # make it an inner-product instead.
      tmp = 1.0 - array_ops.reshape(mu_G, [int(mu_G.shape[0]), -1]) * mu_A
@ -1646,13 +1646,13 @@ class FullyConnectedSeriesFB(InputOutputMultiTowerMultiUse,
      # We now perform the transpose/reverse version of the operations
      # derived above, whose derivation from the original pseudo-code is
      # analgous.
-      # Z = K_G * Z * K_A^T
+      # \\(Z = K_G * Z * K_A^T\\)
      Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A, transpose_b=True))
-      # Z = Z - P_G^T * Z * P_A
+      # \\(Z = Z - P_G^T * Z * P_A\\)
      Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A), transpose_a=True)
-      # Z = normalize (1/E[T]) * Z
+      # \\(Z = normalize (1/E[T]) * Z\\)
      # Note that this normalization is done because we compute the statistics
      # by averaging, not summing, over time. (And the gradient is presumably
      # summed over time, not averaged, and thus their scales are different.)
--- a/tensorflow/contrib/lite/build_ios_universal_lib.sh
+++ b/tensorflow/contrib/lite/build_ios_universal_lib.sh
@ -19,11 +19,16 @@ set -e
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 cd "$SCRIPT_DIR/../../.."
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=x86_64 -j 8
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=x86_64 -j 8 \
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=i386 -j 8
+$SCRIPT_DIR/gen/lib/ios_x86_64/libtensorflow-lite.a
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7 -j 8
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=i386 -j 8 \
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7s -j 8
+$SCRIPT_DIR/gen/lib/ios_i386/libtensorflow-lite.a
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=arm64 -j 8
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7 -j 8 \
 $SCRIPT_DIR/gen/lib/ios_armv7/libtensorflow-lite.a
 make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7s -j 8 \
 $SCRIPT_DIR/gen/lib/ios_armv7s/libtensorflow-lite.a
 make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=arm64 -j 8 \
 $SCRIPT_DIR/gen/lib/ios_arm64/libtensorflow-lite.a
 lipo \
 tensorflow/contrib/lite/gen/lib/ios_x86_64/libtensorflow-lite.a \
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@ -63,6 +63,8 @@ def _safe_div(numerator, denominator, name):
      name=name)
@deprecated(None, 'Please switch to tf.metrics.true_positives. Note that the '
            'order of the labels and predictions arguments has been switched.')
 def streaming_true_positives(predictions,
                             labels,
                             weights=None,
@ -107,6 +109,8 @@ def streaming_true_positives(predictions,
      name=name)
@deprecated(None, 'Please switch to tf.metrics.true_negatives. Note that the '
            'order of the labels and predictions arguments has been switched.')
 def streaming_true_negatives(predictions,
                             labels,
                             weights=None,
@ -151,6 +155,8 @@ def streaming_true_negatives(predictions,
      name=name)
@deprecated(None, 'Please switch to tf.metrics.false_positives. Note that the '
            'order of the labels and predictions arguments has been switched.')
 def streaming_false_positives(predictions,
                              labels,
                              weights=None,
@ -195,6 +201,8 @@ def streaming_false_positives(predictions,
      name=name)
@deprecated(None, 'Please switch to tf.metrics.false_negatives. Note that the '
            'order of the labels and predictions arguments has been switched.')
 def streaming_false_negatives(predictions,
                              labels,
                              weights=None,
@ -238,6 +246,7 @@ def streaming_false_negatives(predictions,
      name=name)
@deprecated(None, 'Please switch to tf.metrics.mean')
 def streaming_mean(values,
                   weights=None,
                   metrics_collections=None,
@ -287,6 +296,7 @@ def streaming_mean(values,
      name=name)
@deprecated(None, 'Please switch to tf.metrics.mean_tensor')
 def streaming_mean_tensor(values,
                          weights=None,
                          metrics_collections=None,
@ -340,9 +350,8 @@ def streaming_mean_tensor(values,
      name=name)
-@deprecated(None,
+@deprecated(None, 'Please switch to tf.metrics.accuracy. Note that the order '
-            'Please switch to tf.metrics.accuracy. Note that the order of the '
+            'of the labels and predictions arguments has been switched.')
            'labels and predictions arguments has been switched.')
 def streaming_accuracy(predictions,
                       labels,
                       weights=None,
@ -400,6 +409,8 @@ def streaming_accuracy(predictions,
      name=name)
@deprecated(None, 'Please switch to tf.metrics.precision. Note that the order '
            'of the labels and predictions arguments has been switched.')
 def streaming_precision(predictions,
                        labels,
                        weights=None,
@ -456,6 +467,8 @@ def streaming_precision(predictions,
      name=name)
@deprecated(None, 'Please switch to tf.metrics.recall. Note that the order '
            'of the labels and predictions arguments has been switched.')
 def streaming_recall(predictions,
                     labels,
                     weights=None,
@ -975,8 +988,8 @@ def streaming_curve_points(labels=None,
    return points, update_op
-@deprecated(None, 'Please switch to tf.metrics.auc. Note that the order of the '
+@deprecated(None, 'Please switch to tf.metrics.auc. Note that the order of '
-            'labels and predictions arguments has been switched.')
+            'the labels and predictions arguments has been switched.')
 def streaming_auc(predictions,
                  labels,
                  weights=None,
@ -1797,9 +1810,9 @@ def streaming_sensitivity_at_specificity(predictions,
      name=name)
-@deprecated(
+@deprecated(None,
-    None, 'Please switch to tf.metrics.precision_at_thresholds. Note that the '
+            'Please switch to tf.metrics.precision_at_thresholds. Note that '
-    'order of the labels and predictions arguments has been switched.')
+            'the order of the labels and predictions arguments are switched.')
 def streaming_precision_at_thresholds(predictions,
                                      labels,
                                      thresholds,
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@ -2891,7 +2891,7 @@ class WeightNormLSTMCell(rnn_cell_impl.RNNCell):
    output_size = weight.get_shape().as_list()[1]
    g = vs.get_variable(name, [output_size], dtype=weight.dtype)
-    return nn_impl.l2_normalize(weight, dim=0) * g
+    return nn_impl.l2_normalize(weight, axis=0) * g
  def _linear(self,
              args,
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@ -610,8 +610,8 @@ def monotonic_attention(p_choose_i, previous_attention, mode):
  addition, once an input sequence element is attended to at a given output
  timestep, elements occurring before it cannot be attended to at subsequent
  output timesteps.  This function generates attention distributions according
-  to these assumptions.  For more information, see ``Online and Linear-Time
+  to these assumptions.  For more information, see `Online and Linear-Time
-  Attention by Enforcing Monotonic Alignments''.
+  Attention by Enforcing Monotonic Alignments`.
  Args:
    p_choose_i: Probability of choosing input sequence/memory element i.  Should
--- a/tensorflow/contrib/sparsemax/init.py
+++ b/tensorflow/contrib/sparsemax/init.py
@ -14,7 +14,7 @@
 # ==============================================================================
 """Module that implements sparsemax and sparsemax loss, see [1].
-[1] https://arxiv.org/abs/1602.02068
+[1]: https://arxiv.org/abs/1602.02068
 ## Sparsemax
--- a/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
+++ b/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
@ -31,7 +31,7 @@ def sparsemax(logits, name=None):
  """Computes sparsemax activations [1].
  For each batch `i` and class `j` we have
-    sparsemax[i, j] = max(logits[i, j] - tau(logits[i, :]), 0)
+    $$sparsemax[i, j] = max(logits[i, j] - tau(logits[i, :]), 0)$$
  [1]: https://arxiv.org/abs/1602.02068
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@ -405,7 +405,13 @@ tensorflow::Status ConvertGraphDefToTensorRT(
                         max_mem_per_engine, static_graph_properties,
                         &output_edge_map, precision_mode);
    if (precision_mode == INT8MODE) {
-      TF_RETURN_IF_ERROR(GetCalibNode(&p));
+      tensorflow::Status status = GetCalibNode(&p);
      if (status != tensorflow::Status::OK()) {
        LOG(WARNING) << "subgraph conversion error for subgraph_index:" << count
                     << " due to: \"" << status.ToString()
                     << "\" SKIPPING......( " << subgraph_node_names.size()
                     << " nodes)";
      }
    } else {
      tensorflow::Status status = ConvertSubGraphToTensorRT(&p);
      if (status != tensorflow::Status::OK()) {
@ -414,8 +420,8 @@ tensorflow::Status ConvertGraphDefToTensorRT(
                     << "\" SKIPPING......( " << subgraph_node_names.size()
                     << " nodes)";
      }
      count++;
    }
    count++;
  }
  graph.ToGraphDef(new_graph_def);
  return tensorflow::Status::OK();
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@ -443,7 +443,9 @@ class Converter {
       * 2) Control dependency inputs contain caret at the beginning and we
       *    remove this and annotate the edge as a control dependency.
       ************************************************************************/
-      string name = input_name[0] == '^' ? input_name.substr(1) : input_name;
+      // skip control nodes
      if (input_name[0] == '^') continue;
      string name = input_name;
      auto first = name.find_first_of(':');
      if (first != string::npos && first + 2 == name.size() &&
          name[first + 1] == '0')
@ -2262,6 +2264,7 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
  auto ws = new tensorflow::tensorrt::TRTWeightStore();
  TF_CHECK_OK(weight_rmgr->Create(calib_op_name, calib_op_name, ws));
  Converter converter(op_res->network_, ws, s.precision_mode == FP16MODE);
  std::vector<string> input_names;
  std::vector<tensorflow::DataType> input_dtypes;
  for (const std::pair<int, int>& input : s.input_inds) {
@ -2270,20 +2273,41 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
    int output_idx = input.second;
    tensorflow::Node* node = s.graph.FindNodeId(node_id);
    auto node_name = node->name();
-    input_names.push_back(node_name);  // insert original node name without port
+    // input_names should use the node name in the graph
-    // TODO(jie): alternative :)
+    // here it should be the input tensor name -> matching the binding
-    if (!s.graph_properties.HasOutputProperties(node_name))
+    // insert original node name without port
    auto tensor_name = node_name;
    if (output_idx != 0) {
      tensor_name = StrCat(tensor_name, ":", output_idx);
    }
    VLOG(2) << "input name: " << node_name << " tensor_name: " << tensor_name
            << " idx: " << output_idx;
    auto shape_inference_node_name = node_name;
    auto shape_inference_output_idx = output_idx;
    // rewire the shape inference to original node in the graph
    if (s.output_edge_map->count(tensor_name)) {
      shape_inference_node_name = s.output_edge_map->at(tensor_name).second;
      shape_inference_output_idx = s.output_edge_map->at(tensor_name).first;
    }
    if (shape_inference_output_idx < 0) continue;
    VLOG(2) << "shapeinference name: " << shape_inference_node_name
            << " idx: " << shape_inference_output_idx;
    if (!s.graph_properties.HasOutputProperties(shape_inference_node_name))
      return tensorflow::errors::Internal("failed to find input node: " +
-                                          node_name);
+                                          shape_inference_node_name);
-    auto op_info_vec = s.graph_properties.GetOutputProperties(node_name);
+    auto op_info_vec =
-    if (static_cast<int>(op_info_vec.size()) < output_idx)
+        s.graph_properties.GetOutputProperties(shape_inference_node_name);
    if (static_cast<int>(op_info_vec.size()) <= shape_inference_output_idx)
      return tensorflow::errors::Internal(
-          "accessing output index of: ", output_idx, ", at node: ", node_name,
+          "accessing output index of: ", shape_inference_output_idx,
-          "with output entry from shape_map: ", op_info_vec.size());
+          ", at node: ", shape_inference_node_name,
-
+          " with output entry from shape_map: ", op_info_vec.size());
    auto op_info = op_info_vec.at(output_idx);
    auto op_info = op_info_vec.at(shape_inference_output_idx);
    tensorflow::DataType tf_dtype = op_info.dtype();
    input_dtypes.push_back(tf_dtype);
@ -2294,16 +2318,23 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
                   << "' failed";
      return type_status;
    }
    TF_CHECK_OK(ConvertDType(tf_dtype, &dtype));
    VLOG(2) << "accessing output index of: " << output_idx
            << ", at node: " << node_name
            << "with output entry from shape_map: " << op_info_vec.size();
    // TODO(ben,jie): update TRT input format/dimension
    nvinfer1::DimsCHW input_dim_psuedo_chw;
    for (int i = 0; i < 3; i++) input_dim_psuedo_chw.d[i] = 1;
    // TODO(jie): TRT 3.x only support 4 dimensional input tensor.
    //            update the code once TRT 4.0 comes out.
    if (op_info.shape().dim_size() != 4) {
      string err_str = "Require 4 dimensional input.";
      StrAppend(&err_str, " Got ", op_info.shape().dim_size(), " ",
                shape_inference_node_name);
      return tensorflow::errors::Unimplemented(err_str);
    }
    for (int i = 1; i < op_info.shape().dim_size(); i++) {
      VLOG(2) << "dimension: " << i
              << " , size: " << op_info.shape().dim(i).size();
@ -2312,8 +2343,11 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
    // TODO(ben,jie): proper way to restore input tensor name?
    auto input_tensor_name = node_name;
-    if (output_idx != 0) input_tensor_name = StrCat(node_name, ":", output_idx);
+    if (output_idx != 0) {
      input_tensor_name = StrCat(node_name, ":", output_idx);
    }
    input_names.push_back(input_tensor_name);
    nvinfer1::ITensor* input_tensor = converter.network()->addInput(
        input_tensor_name.c_str(), dtype, input_dim_psuedo_chw);
@ -2377,11 +2411,13 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
    tensor->setType(trt_dtype);
  }
-  VLOG(2) << "finished output";
+  VLOG(2) << "Finished processing outputs";
  // Build the engine
  op_res->builder_->setMaxBatchSize(s.max_batch_size);
  op_res->builder_->setMaxWorkspaceSize(s.max_workspace_size_bytes);
  VLOG(0) << "Max batch size= " << s.max_batch_size
          << " max workspace size= " << s.max_workspace_size_bytes;
  // Build the TRT op
  // TODO(sami,ben,jie): proper naming!
@ -2475,7 +2511,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
  std::vector<string> input_names;
  std::vector<tensorflow::DataType> input_dtypes;
  for (const std::pair<int, int>& input : s.input_inds) {
-    VLOG(2) << "parsing input!!!!!";
+    VLOG(2) << "parsing input. Node id= " << input.first;
    int node_id = input.first;
    int output_idx = input.second;
    tensorflow::Node* node = s.graph.FindNodeId(node_id);
--- a/tensorflow/core/api_def/base_api/api_def_ClipByValue.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ClipByValue.pbtxt
@ -0,0 +1,36 @@
 op {
  graph_op_name: "ClipByValue"
  in_arg {
    name: "t"
    description: <<END
 A `Tensor`.
 END
  }
  in_arg {
    name: "clip_value_min"
    description: <<END
 A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
 as `t`. The minimum value to clip by.
 END
  }
  in_arg {
    name: "clip_value_max"
    description: <<END
 A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
 as `t`. The maximum value to clip by.
 END
  }
  out_arg {
    name: "output"
    description: <<END
 A clipped `Tensor` with the same shape as input 't'.
 END
  }
  summary: "Clips tensor values to a specified min and max."
  description: <<END
 Given a tensor `t`, this operation returns a tensor of the same type and
 shape as `t` with its values clipped to `clip_value_min` and `clip_value_max`.
 Any values less than `clip_value_min` are set to `clip_value_min`. Any values
 greater than `clip_value_max` are set to `clip_value_max`.
 END
 }
--- a/tensorflow/core/api_def/python_api/api_def_ClipByValue.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_ClipByValue.pbtxt
@ -0,0 +1,4 @@
 op {
  graph_op_name: "ClipByValue"
  visibility: HIDDEN
 }
--- a/tensorflow/core/common_runtime/process_util.cc
+++ b/tensorflow/core/common_runtime/process_util.cc
@ -15,6 +15,9 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/process_util.h"
 #ifdef INTEL_MKL
 #include <omp.h>
 #endif
 #include <string.h>
 #include "tensorflow/core/lib/core/threadpool.h"
@ -47,10 +50,24 @@ thread::ThreadPool* ComputePool(const SessionOptions& options) {
 }
 int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
-  const int32 t = options.config.inter_op_parallelism_threads();
+  const int32 inter_op = options.config.inter_op_parallelism_threads();
-  if (t != 0) return t;
+  if (inter_op != 0) return inter_op;
 #ifdef INTEL_MKL
  // MKL library executes ops in parallel using OMP threads
  // Set inter_op conservatively to avoid thread oversubscription that could
  // lead to severe perf degradations and OMP resource exhaustion
  const int mkl_intra_op = omp_get_max_threads();
  CHECK_GE(mkl_intra_op, 1);
  const int32 mkl_inter_op = std::max(
      (port::NumSchedulableCPUs() + mkl_intra_op - 1) / mkl_intra_op, 2);
  VLOG(0) << "Creating new thread pool with default inter op setting: "
          << mkl_inter_op
          << ". Tune using inter_op_parallelism_threads for best performance.";
  return mkl_inter_op;
 #else
  // Default to using the number of cores available in the process.
  return port::NumSchedulableCPUs();
 #endif
 }
 thread::ThreadPool* NewThreadPoolFromSessionOptions(
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@ -11,6 +11,10 @@ load(
    "//tensorflow/core:platform/default/build_config.bzl",
    "tf_protos_grappler",
 )
 load(
    "//tensorflow/core:platform/default/build_config_root.bzl",
    "if_static",
 )
 cc_library(
    name = "static_schedule",
@ -537,11 +541,28 @@ tf_cuda_cc_test(
    ],
 )
 # This rule is header-only unless the build is static (--config=monolithic). Its
 # implementation is included directly in the framework shared object.
 cc_library(
    name = "custom_graph_optimizer_registry",
    srcs = ["custom_graph_optimizer_registry.cc"],
    hdrs = ["custom_graph_optimizer_registry.h"],
    visibility = ["//visibility:public"],
    deps = [
        ":custom_graph_optimizer",
        "//tensorflow/core:lib",
    ] + if_static(
        [":custom_graph_optimizer_registry_impl"],
    ),
 )
 # This rule contains static variables for the optimizer registry. Do not depend
 # on it directly; use :custom_graph_optimizer_registry, and link against
 # libtensorflow_framework.so for the registry symbols.
 cc_library(
    name = "custom_graph_optimizer_registry_impl",
    srcs = ["custom_graph_optimizer_registry.cc"],
    hdrs = ["custom_graph_optimizer_registry.h"],
    visibility = ["//tensorflow:__subpackages__"],
    deps = [
        ":custom_graph_optimizer",
        "//tensorflow/core:lib",
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@ -3549,6 +3549,7 @@ tf_kernel_library(
        "pooling_ops_3d_gpu.cu.cc",
    ],
    deps = [
        ":bounds_check",
        ":conv_2d",
        ":conv_3d",
        ":conv_ops",
@ -3559,6 +3560,7 @@ tf_kernel_library(
        "//tensorflow/core:lib",
        "//tensorflow/core:lib_internal",
        "//tensorflow/core:nn_ops_op_lib",
        "//tensorflow/core:stream_executor",
        "//third_party/eigen3",
    ],
 )
--- a/tensorflow/core/kernels/cwise_op_abs.cc
+++ b/tensorflow/core/kernels/cwise_op_abs.cc
@ -18,9 +18,7 @@ limitations under the License.
 namespace tensorflow {
 REGISTER5(UnaryOp, CPU, "Abs", functor::abs, float, Eigen::half, double, int32,
          int64);
 #if !defined(IS_MOBILE_PLATFORM)
 REGISTER2(UnaryOp, CPU, "ComplexAbs", functor::abs, complex64, complex128);
 #endif
 #if GOOGLE_CUDA
 REGISTER4(UnaryOp, GPU, "Abs", functor::abs, float, Eigen::half, double, int64);
--- a/tensorflow/core/kernels/cwise_op_clip.cc
+++ b/tensorflow/core/kernels/cwise_op_clip.cc
@ -0,0 +1,225 @@
 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/kernels/cwise_op_clip.h"
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 // Basic coefficient-wise tenary operations.
 // This is the case for example of the clip_by_value.
 //   Device: E.g., CPUDevice, GPUDevice.
 //   Functor: defined above. E.g., functor::clip.
 template <typename Device, typename T>
 class ClipOp : public OpKernel {
 public:
  explicit ClipOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
  void Compute(OpKernelContext* ctx) override {
    const Tensor& in0 = ctx->input(0);
    const Tensor& in1 = ctx->input(1);
    const Tensor& in2 = ctx->input(2);
    auto in0_flat = in0.flat<T>();
    auto in1_flat = in1.flat<T>();
    auto in2_flat = in2.flat<T>();
    const Device& d = ctx->eigen_device<Device>();
    Tensor* out = nullptr;
    OP_REQUIRES_OK(
        ctx, ctx->forward_input_or_allocate_output({0}, 0, in0.shape(), &out));
    auto out_flat = out->flat<T>();
    if (in1.shape() == in2.shape()) {
      if (in0.shape() == in1.shape()) {
        functor::TernaryClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
                                            out_flat);
      } else {
        OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in1.shape()),
                    errors::InvalidArgument(
                        "clip_value_min and clip_value_max must be either of "
                        "the same shape as input, or a scalar. ",
                        "input shape: ", in0.shape().DebugString(),
                        "clip_value_min shape: ", in1.shape().DebugString(),
                        "clip_value_max shape: ", in2.shape().DebugString()));
        functor::UnaryClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
                                          out_flat);
      }
    } else {
      if (in0.shape() == in1.shape()) {
        OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in2.shape()),
                    errors::InvalidArgument(
                        "clip_value_min and clip_value_max must be either of "
                        "the same shape as input, or a scalar. ",
                        "input shape: ", in0.shape().DebugString(),
                        "clip_value_min shape: ", in1.shape().DebugString(),
                        "clip_value_max shape: ", in2.shape().DebugString()));
        functor::BinaryLeftClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
                                               out_flat);
      } else {
        OP_REQUIRES(ctx,
                    (in0.shape() == in2.shape() &&
                     TensorShapeUtils::IsScalar(in1.shape())),
                    errors::InvalidArgument(
                        "clip_value_min and clip_value_max must be either of "
                        "the same shape as input, or a scalar. ",
                        "input shape: ", in0.shape().DebugString(),
                        "clip_value_min shape: ", in1.shape().DebugString(),
                        "clip_value_max shape: ", in2.shape().DebugString()));
        functor::BinaryRightClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
                                                out_flat);
      }
    }
  }
 };
 namespace functor {
 // Unary functor for clip [Tensor, Scalar, Scalar]
 template <typename T>
 struct UnaryClipFunc {
  UnaryClipFunc(const T& value_min, const T& value_max)
      : value_min(value_min), value_max(value_max) {}
  const T operator()(const T& value) const {
    return std::max(std::min(value, value_max), value_min);
  }
  T value_min;
  T value_max;
 };
 template <typename T>
 struct UnaryClipOp<CPUDevice, T> {
  void operator()(const CPUDevice& d, typename TTypes<T>::ConstFlat& in0_flat,
                  typename TTypes<T>::ConstFlat& in1_flat,
                  typename TTypes<T>::ConstFlat& in2_flat,
                  typename TTypes<T>::Flat& out_flat) const {
    out_flat = in0_flat.unaryExpr(UnaryClipFunc<T>(in1_flat(0), in2_flat(0)));
  }
 };
 // Binary functor for clip [Tensor, Scalar, Tensor]
 template <typename T>
 struct BinaryRightClipFunc {
  explicit BinaryRightClipFunc(const T& value_min) : value_min(value_min) {}
  const T operator()(const T& value, const T& value_max) const {
    return std::max(std::min(value, value_max), value_min);
  }
  T value_min;
 };
 template <typename T>
 struct BinaryRightClipOp<CPUDevice, T> {
  void operator()(const CPUDevice& d, typename TTypes<T>::ConstFlat& in0_flat,
                  typename TTypes<T>::ConstFlat& in1_flat,
                  typename TTypes<T>::ConstFlat& in2_flat,
                  typename TTypes<T>::Flat& out_flat) const {
    out_flat =
        in0_flat.binaryExpr(in2_flat, BinaryRightClipFunc<T>(in1_flat(0)));
  }
 };
 // Binary functor for clip [Tensor, Tensor, Scalar]
 template <typename T>
 struct BinaryLeftClipFunc {
  explicit BinaryLeftClipFunc(const T& value_max) : value_max(value_max) {}
  const T operator()(const T& value, const T& value_min) const {
    return std::max(std::min(value, value_max), value_min);
  }
  T value_max;
 };
 template <typename T>
 struct BinaryLeftClipOp<CPUDevice, T> {
  void operator()(const CPUDevice& d, typename TTypes<T>::ConstFlat& in0_flat,
                  typename TTypes<T>::ConstFlat& in1_flat,
                  typename TTypes<T>::ConstFlat& in2_flat,
                  typename TTypes<T>::Flat& out_flat) const {
    out_flat =
        in0_flat.binaryExpr(in1_flat, BinaryLeftClipFunc<T>(in2_flat(0)));
  }
 };
 // Ternary functor for clip [Tensor, Tensor, Tensor]
 template <typename T>
 struct TernaryClipOp<CPUDevice, T> {
  void operator()(const CPUDevice& d, typename TTypes<T>::ConstFlat& in0_flat,
                  typename TTypes<T>::ConstFlat& in1_flat,
                  typename TTypes<T>::ConstFlat& in2_flat,
                  typename TTypes<T>::Flat& out_flat) const {
    out_flat.device(d) = in0_flat.cwiseMin(in2_flat).cwiseMax(in1_flat);
  }
 };
 #define INSTANTIATE_CPU(T)                         \
  template struct UnaryClipOp<CPUDevice, T>;       \
  template struct BinaryRightClipOp<CPUDevice, T>; \
  template struct BinaryLeftClipOp<CPUDevice, T>;  \
  template struct TernaryClipOp<CPUDevice, T>;
 INSTANTIATE_CPU(Eigen::half);
 INSTANTIATE_CPU(float);
 INSTANTIATE_CPU(double);
 INSTANTIATE_CPU(int8);
 INSTANTIATE_CPU(int16);
 INSTANTIATE_CPU(int32);
 INSTANTIATE_CPU(int64);
 INSTANTIATE_CPU(uint8);
 INSTANTIATE_CPU(uint16);
 #undef INSTANTIATE_CPU
 }  // namespace functor
 #define REGISTER_CPU_KERNEL(type)                                       \
  REGISTER_KERNEL_BUILDER(                                              \
      Name("ClipByValue").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
      ClipOp<CPUDevice, type>);
 REGISTER_CPU_KERNEL(Eigen::half);
 REGISTER_CPU_KERNEL(float);
 REGISTER_CPU_KERNEL(double);
 REGISTER_CPU_KERNEL(int8);
 REGISTER_CPU_KERNEL(int16);
 REGISTER_CPU_KERNEL(int32);
 REGISTER_CPU_KERNEL(int64);
 REGISTER_CPU_KERNEL(uint8);
 REGISTER_CPU_KERNEL(uint16);
 #undef REGISTER_CPU_KERNEL
 #if GOOGLE_CUDA
 #define REGISTER_GPU_KERNEL(type)                                       \
  REGISTER_KERNEL_BUILDER(                                              \
      Name("ClipByValue").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
      ClipOp<GPUDevice, type>);
 REGISTER_GPU_KERNEL(Eigen::half);
 REGISTER_GPU_KERNEL(float);
 REGISTER_GPU_KERNEL(double);
 REGISTER_GPU_KERNEL(int8);
 REGISTER_GPU_KERNEL(int16);
 REGISTER_GPU_KERNEL(int64);
 REGISTER_GPU_KERNEL(uint8);
 REGISTER_GPU_KERNEL(uint16);
 // A special GPU kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
 // registration requires all int32 inputs and outputs to be in host memory.
 REGISTER_KERNEL_BUILDER(Name("ClipByValue")
                            .Device(DEVICE_GPU)
                            .HostMemory("t")
                            .HostMemory("clip_value_min")
                            .HostMemory("clip_value_max")
                            .HostMemory("output")
                            .TypeConstraint<int32>("T"),
                        ClipOp<CPUDevice, int32>);
 #undef REGISTER_GPU_KERNEL
 #endif
 }  // namespace tensorflow
--- a/tensorflow/core/kernels/cwise_op_clip.h
+++ b/tensorflow/core/kernels/cwise_op_clip.h
@ -0,0 +1,61 @@
 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #ifndef TENSORFLOW_CORE_KERNELS_CWISE_OP_CLIP_H_
 #define TENSORFLOW_CORE_KERNELS_CWISE_OP_CLIP_H_
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 namespace tensorflow {
 namespace functor {
 // Unary functor for clip [Tensor, Scalar, Scalar]
 template <typename Device, typename T>
 struct UnaryClipOp {
  void operator()(const Device &d, typename TTypes<T>::ConstFlat &in0_flat,
                  typename TTypes<T>::ConstFlat &in1_flat,
                  typename TTypes<T>::ConstFlat &in2_flat,
                  typename TTypes<T>::Flat &out_flat) const;
 };
 // Binary functor for clip [Tensor, Scalar, Tensor]
 template <typename Device, typename T>
 struct BinaryRightClipOp {
  void operator()(const Device &d, typename TTypes<T>::ConstFlat &in0_flat,
                  typename TTypes<T>::ConstFlat &in1_flat,
                  typename TTypes<T>::ConstFlat &in2_flat,
                  typename TTypes<T>::Flat &out_flat) const;
 };
 // Binary functor for clip [Tensor, Tensor, Scalar]
 template <typename Device, typename T>
 struct BinaryLeftClipOp {
  void operator()(const Device &d, typename TTypes<T>::ConstFlat &in0_flat,
                  typename TTypes<T>::ConstFlat &in1_flat,
                  typename TTypes<T>::ConstFlat &in2_flat,
                  typename TTypes<T>::Flat &out_flat) const;
 };
 // Ternary functor for clip [Tensor, Tensor, Tensor]
 template <typename Device, typename T>
 struct TernaryClipOp {
  void operator()(const Device &d, typename TTypes<T>::ConstFlat &in0_flat,
                  typename TTypes<T>::ConstFlat &in1_flat,
                  typename TTypes<T>::ConstFlat &in2_flat,
                  typename TTypes<T>::Flat &out_flat) const;
 };
 }  // namespace functor
 }  // namespace tensorflow
 #endif  // TENSORFLOW_CORE_KERNELS_CWISE_OP_CLIP_H_
--- a/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc
@ -0,0 +1,134 @@
 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #if GOOGLE_CUDA
 #define EIGEN_USE_GPU
 #include "tensorflow/core/kernels/cwise_op_clip.h"
 #include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 namespace tensorflow {
 template <typename T>
 __global__ void UnaryClipCustomKernel(const int32 size_in, const T *in0,
                                      const T *in1, const T *in2, T *out) {
  CUDA_1D_KERNEL_LOOP(i, size_in) {
    T value = in2[0] < in0[i] ? in2[0] : in0[i];
    out[i] = value < in1[0] ? in1[0] : value;
  }
 }
 template <typename T>
 __global__ void BinaryRightClipCustomKernel(const int32 size_in, const T *in0,
                                            const T *in1, const T *in2,
                                            T *out) {
  CUDA_1D_KERNEL_LOOP(i, size_in) {
    T value = in2[i] < in0[i] ? in2[i] : in0[i];
    out[i] = value < in1[0] ? in1[0] : value;
  }
 }
 template <typename T>
 __global__ void BinaryLeftClipCustomKernel(const int32 size_in, const T *in0,
                                           const T *in1, const T *in2, T *out) {
  CUDA_1D_KERNEL_LOOP(i, size_in) {
    T value = in2[0] < in0[i] ? in2[0] : in0[i];
    out[i] = value < in1[i] ? in1[i] : value;
  }
 }
 namespace functor {
 // Unary functor for clip [Tensor, Scalar, Scalar]
 template <typename T>
 struct UnaryClipOp<GPUDevice, T> {
  void operator()(const GPUDevice &d, typename TTypes<T>::ConstFlat &in0_flat,
                  typename TTypes<T>::ConstFlat &in1_flat,
                  typename TTypes<T>::ConstFlat &in2_flat,
                  typename TTypes<T>::Flat &out_flat) const {
    CudaLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d);
    UnaryClipCustomKernel<T>
        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
            in0_flat.size(), in0_flat.data(), in1_flat.data(), in2_flat.data(),
            out_flat.data());
  }
 };
 // Binary functor for clip [Tensor, Scalar, Tensor]
 template <typename T>
 struct BinaryRightClipOp<GPUDevice, T> {
  void operator()(const GPUDevice &d, typename TTypes<T>::ConstFlat &in0_flat,
                  typename TTypes<T>::ConstFlat &in1_flat,
                  typename TTypes<T>::ConstFlat &in2_flat,
                  typename TTypes<T>::Flat &out_flat) const {
    CudaLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d);
    BinaryRightClipCustomKernel<T>
        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
            in0_flat.size(), in0_flat.data(), in1_flat.data(), in2_flat.data(),
            out_flat.data());
  }
 };
 // Binary functor for clip [Tensor, Tensor, Scalar]
 template <typename T>
 struct BinaryLeftClipOp<GPUDevice, T> {
  void operator()(const GPUDevice &d, typename TTypes<T>::ConstFlat &in0_flat,
                  typename TTypes<T>::ConstFlat &in1_flat,
                  typename TTypes<T>::ConstFlat &in2_flat,
                  typename TTypes<T>::Flat &out_flat) const {
    CudaLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d);
    BinaryLeftClipCustomKernel<T>
        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
            in0_flat.size(), in0_flat.data(), in1_flat.data(), in2_flat.data(),
            out_flat.data());
  }
 };
 // Ternary functor for clip [Tensor, Tensor, Tensor]
 template <typename T>
 struct TernaryClipOp<GPUDevice, T> {
  void operator()(const GPUDevice &d, typename TTypes<T>::ConstFlat &in0_flat,
                  typename TTypes<T>::ConstFlat &in1_flat,
                  typename TTypes<T>::ConstFlat &in2_flat,
                  typename TTypes<T>::Flat &out_flat) const {
    out_flat.device(d) = in0_flat.cwiseMin(in2_flat).cwiseMax(in1_flat);
  }
 };
 #define INSTANTIATE_GPU(T)                         \
  template struct UnaryClipOp<GPUDevice, T>;       \
  template struct BinaryRightClipOp<GPUDevice, T>; \
  template struct BinaryLeftClipOp<GPUDevice, T>;  \
  template struct TernaryClipOp<GPUDevice, T>;
 INSTANTIATE_GPU(Eigen::half);
 INSTANTIATE_GPU(float);
 INSTANTIATE_GPU(double);
 INSTANTIATE_GPU(int8);
 INSTANTIATE_GPU(int16);
 INSTANTIATE_GPU(int32);
 INSTANTIATE_GPU(int64);
 INSTANTIATE_GPU(uint8);
 INSTANTIATE_GPU(uint16);
 #undef INSTANTIATE_GPU
 }  // namespace functor
 }  // namespace tensorflow
 #endif  // GOOGLE_CUDA
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/eigen_pooling.h"
 #include "tensorflow/core/kernels/ops_util.h"
@ -56,7 +57,7 @@ template <typename Device, typename T>
 static void SpatialMaxPoolWithArgMaxHelper(
    OpKernelContext* context, Tensor* output, Tensor* output_arg_max,
    Tensor* input_backprop, const Tensor& tensor_in, const Tensor& out_backprop,
-    const PoolParameters& params, const Padding& padding) {
+    const PoolParameters& params) {
  typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
      ConstEigenMatrixMap;
  typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
@ -151,7 +152,7 @@ static void SpatialMaxPoolWithArgMaxHelper(
      }
    }
-    {
+    if (input_backprop != nullptr) {
      auto input_backprop_flat = input_backprop->flat<T>();
      auto out_arg_max_flat = output_arg_max->flat<int64>();
      auto out_backprop_flat = out_backprop.flat<T>();
@ -173,9 +174,9 @@ static void SpatialMaxPoolWithArgMaxHelper(
        // Although this check is in the inner loop, it is worth its value
        // so we don't end up with memory corruptions. Our benchmark shows that
        // the performance impact is quite small
-        CHECK(input_backprop_index >= in_start && input_backprop_index < in_end)
+        // CHECK(input_backprop_index >= in_start && input_backprop_index <
-            << "Invalid input backprop index: " << input_backprop_index << ", "
+        // in_end)
-            << in_start << ", " << in_end;
+        FastBoundsCheck(input_backprop_index - in_start, in_end - in_start);
        input_backprop_flat(input_backprop_index) += out_backprop_flat(index);
      }
    }
@ -293,7 +294,7 @@ class MaxPoolingGradOp : public OpKernel {
    SpatialMaxPoolWithArgMaxHelper<CPUDevice, T>(
        context, &tensor_out_dup, &tensor_out_arg_max, output, tensor_in,
-        out_backprop, params, padding_);
+        out_backprop, params);
  }
 private:
@ -869,6 +870,17 @@ class MaxPoolingNoMaskV2Op : public OpKernel {
 template <typename Device, typename T>
 struct LaunchMaxPoolingWithArgmax;
 template <typename T>
 struct LaunchMaxPoolingWithArgmax<CPUDevice, T> {
  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& input, Tensor* output, Tensor* argmax,
                     bool propagate_nans) {
    Tensor unused;
    SpatialMaxPoolWithArgMaxHelper<CPUDevice, T>(
        context, output, argmax, nullptr, input, unused, params);
  }
 };
 template <typename Device, typename T>
 class MaxPoolingWithArgmaxOp : public OpKernel {
 public:
@ -921,6 +933,53 @@ class MaxPoolingWithArgmaxOp : public OpKernel {
 template <typename Device, typename T>
 struct LaunchMaxPoolingGradWithArgmax;
 template <typename T>
 struct LaunchMaxPoolingGradWithArgmax<CPUDevice, T> {
  typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
      EigenMatrixMap;
  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& grad_in, const Tensor& argmax,
                     Tensor* grad_out) {
    const DeviceBase::CpuWorkerThreads& worker_threads =
        *(context->device()->tensorflow_cpu_worker_threads());
    auto shard = [&grad_in, &argmax, &grad_out](int64 start, int64 limit) {
      const int64 batch_size =
          GetTensorDim(grad_out->shape(), FORMAT_NHWC, 'N');
      const int64 output_size_per_batch = grad_out->NumElements() / batch_size;
      const int64 input_size_per_batch = grad_in.NumElements() / batch_size;
      {
        auto grad_out_flat = grad_out->flat<T>();
        auto argmax_flat = argmax.flat<int64>();
        auto grad_in_flat = grad_in.flat<T>();
        const int64 output_start = start * output_size_per_batch;
        const int64 output_end = limit * output_size_per_batch;
        EigenMatrixMap inputShard(grad_out_flat.data() + output_start, 1,
                                  output_end - output_start);
        inputShard.setConstant(T(0));
        const int input_start = start * input_size_per_batch;
        const int input_end = limit * input_size_per_batch;
        for (int64 index = input_start; index < input_end; index++) {
          const int64 grad_out_index = argmax_flat(index);
          CHECK(grad_out_index >= output_start && grad_out_index < output_end)
              << "Invalid output gradient index: " << grad_out_index << ", "
              << output_start << ", " << output_end;
          grad_out_flat(grad_out_index) += grad_in_flat(index);
        }
      }
    };
    const int64 batch_size = GetTensorDim(grad_out->shape(), FORMAT_NHWC, 'N');
    const int64 shard_cost = grad_out->NumElements() / batch_size;
    Shard(worker_threads.num_threads, worker_threads.workers, batch_size,
          shard_cost, shard);
  }
 };
 template <typename Device, typename T>
 class MaxPoolingGradWithArgmaxOp : public OpKernel {
 public:
@ -1309,7 +1368,17 @@ struct LaunchMaxPoolingGradGradWithArgmax<Eigen::GpuDevice, T> {
                              .HostMemory("ksize")                       \
                              .HostMemory("strides")                     \
                              .TypeConstraint<T>("T"),                   \
-                          MaxPoolingGradGradOp<D##Device, T>);
+                          MaxPoolingGradGradOp<D##Device, T>)            \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")                      \
                              .Device(DEVICE_##D)                        \
                              .TypeConstraint<int64>("Targmax")          \
                              .TypeConstraint<T>("T"),                   \
                          MaxPoolingWithArgmaxOp<D##Device, T>);         \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax")                  \
                              .Device(DEVICE_##D)                        \
                              .TypeConstraint<T>("T")                    \
                              .TypeConstraint<int64>("Targmax"),         \
                          MaxPoolingGradWithArgmaxOp<D##Device, T>);
 // Below kernels implemented only for CPU device.
 #define REGISTER_CPU_ONLY_POOL_KERNELS(T)                          \
@ -1374,16 +1443,6 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_MAX_POOL_KERNELS);
                              .HostMemory("strides")                 \
                              .TypeConstraint<T>("T"),               \
                          MaxPoolingNoMaskV2Op<GPUDevice, T>);       \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")                  \
                              .Device(DEVICE_GPU)                    \
                              .TypeConstraint<int64>("Targmax")      \
                              .TypeConstraint<T>("T"),               \
                          MaxPoolingWithArgmaxOp<GPUDevice, T>);     \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax")              \
                              .Device(DEVICE_GPU)                    \
                              .TypeConstraint<T>("T")                \
                              .TypeConstraint<int64>("Targmax"),     \
                          MaxPoolingGradWithArgmaxOp<GPUDevice, T>); \
  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradGradWithArgmax")          \
                              .Device(DEVICE_GPU)                    \
                              .TypeConstraint<T>("T")                \
--- a/tensorflow/core/kernels/segment_reduction_ops.h
+++ b/tensorflow/core/kernels/segment_reduction_ops.h
@ -16,6 +16,12 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
 #define TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
 // This file requires the following include because it uses CudaAtomicMax:
 // #include "tensorflow/core/util/cuda_kernel_helper.h"
 // Unfortunately we can't add the #include, since it breaks compilation for
 // non-GPU targets. This only breaks in clang, because it's more strict for
 // template code and CudaAtomicMax is used in template context.
 // This file requires the following include because it uses CudaAtomicMax:
 // #include "tensorflow/core/util/cuda_kernel_helper.h"
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@ -117,7 +117,11 @@ REGISTER_OP("TakeDataset")
    .Output("handle: variant")
    .Attr("output_types: list(type) >= 1")
    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
      shape_inference::ShapeHandle count_shape;
      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &count_shape));
      return shape_inference::ScalarShape(c);
    });
 REGISTER_OP("SkipDataset")
    .Input("input_dataset: variant")
@ -125,7 +129,11 @@ REGISTER_OP("SkipDataset")
    .Output("handle: variant")
    .Attr("output_types: list(type) >= 1")
    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
      shape_inference::ShapeHandle count_shape;
      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &count_shape));
      return shape_inference::ScalarShape(c);
    });
 REGISTER_OP("BytesProducedStatsDataset")
    .Input("input_dataset: variant")
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@ -1558,6 +1558,14 @@ REGISTER_OP("Bucketize")
    .Attr("boundaries: list(float)")
    .SetShapeFn(shape_inference::UnchangedShape);
 REGISTER_OP("ClipByValue")
    .Input("t: T")
    .Input("clip_value_min: T")
    .Input("clip_value_max: T")
    .Output("output: T")
    .Attr("T: numbertype")
    .SetShapeFn(shape_inference::UnchangedShape);
 #ifdef INTEL_MKL
 REGISTER_OP("_MklAddN")
    .Input("inputs: N * T")
--- a/tensorflow/core/platform/macros.h
+++ b/tensorflow/core/platform/macros.h
@ -31,13 +31,14 @@ limitations under the License.
  __attribute__((__format__(__printf__, string_index, first_to_check)))
 #define TF_SCANF_ATTRIBUTE(string_index, first_to_check) \
  __attribute__((__format__(__scanf__, string_index, first_to_check)))
-#elif defined(COMPILER_MSVC)
+#elif defined(_MSC_VER)
 // Non-GCC equivalents
 #define TF_ATTRIBUTE_NORETURN __declspec(noreturn)
-#define TF_ATTRIBUTE_ALWAYS_INLINE
+#define TF_ATTRIBUTE_ALWAYS_INLINE __forceinline
 #define TF_ATTRIBUTE_NOINLINE
 #define TF_ATTRIBUTE_UNUSED
 #define TF_ATTRIBUTE_COLD
 #define TF_ATTRIBUTE_WEAK
 #define TF_MUST_USE_RESULT
 #define TF_PACKED
 #define TF_PRINTF_ATTRIBUTE(string_index, first_to_check)
@ -57,7 +58,7 @@ limitations under the License.
 #endif
 // Control visiblity outside .so
-#if defined(COMPILER_MSVC)
+#if defined(_WIN32)
 #ifdef TF_COMPILE_LIBRARY
 #define TF_EXPORT __declspec(dllexport)
 #else
@ -65,7 +66,7 @@ limitations under the License.
 #endif  // TF_COMPILE_LIBRARY
 #else
 #define TF_EXPORT __attribute__((visibility("default")))
-#endif  // COMPILER_MSVC
+#endif  // _WIN32
 #ifdef __has_builtin
 #define TF_HAS_BUILTIN(x) __has_builtin(x)
--- a/tensorflow/docs_src/community/documentation.md
+++ b/tensorflow/docs_src/community/documentation.md
@ -148,19 +148,7 @@ viewing. Do not include url parameters in the source code URL.
 Before building the documentation, you must first set up your environment by
 doing the following:
-1. If pip isn't installed on your machine, install it now by issuing the
+1. If bazel is not installed on your machine, install it now. If you are on
 following command:
        $ sudo easy_install pip
 2. Use pip to install codegen, mock, and pandas by issuing the following
   command (Note: If you are using
   a [virtualenv](https://virtualenv.pypa.io/en/stable/) to manage your
   dependencies, you may not want to use sudo for these installations):
        $ sudo pip install codegen mock pandas
 3. If bazel is not installed on your machine, install it now. If you are on
   Linux, install bazel by issuing the following command:
        $ sudo apt-get install bazel  # Linux
@ -168,10 +156,10 @@ following command:
    If you are on Mac OS, find bazel installation instructions on
    [this page](https://bazel.build/versions/master/docs/install.html#mac-os-x).
-4. Change directory to the top-level `tensorflow` directory of the TensorFlow
+2. Change directory to the top-level `tensorflow` directory of the TensorFlow
   source code.
-5. Run the `configure` script and answer its prompts appropriately for your
+3. Run the `configure` script and answer its prompts appropriately for your
   system.
        $ ./configure
--- a/tensorflow/docs_src/extend/adding_an_op.md
+++ b/tensorflow/docs_src/extend/adding_an_op.md
@ -530,56 +530,58 @@ form [described below](#attr_types).
 For example, if you'd like the `ZeroOut` op to preserve a user-specified index,
 instead of only the 0th element, you can register the op like so:
-<pre class="prettyprint"><code class="lang-cpp">
+```c++
-REGISTER\_OP("ZeroOut")
+REGISTER_OP("ZeroOut")
-    <b>.Attr("preserve\_index: int")</b>
+    .Attr("preserve_index: int")
-    .Input("to\_zero: int32")
+    .Input("to_zero: int32")
    .Output("zeroed: int32");
-</code></pre>
+```
 (Note that the set of [attribute types](#attr_types) is different from the
@{tf.DType$tensor types} used for inputs and outputs.)
 Your kernel can then access this attr in its constructor via the `context`
 parameter:
-<pre class="prettyprint"><code class="lang-cpp">
+```c++
 class ZeroOutOp : public OpKernel {
 public:
-  explicit ZeroOutOp(OpKernelConstruction\* context) : OpKernel(context) {<b>
+  explicit ZeroOutOp(OpKernelConstruction* context) : OpKernel(context) {
    // Get the index of the value to preserve
-    OP\_REQUIRES\_OK(context,
+    OP_REQUIRES_OK(context,
-                   context-&gt;GetAttr("preserve\_index", &preserve\_index\_));
+                   context->GetAttr("preserve_index", &preserve_index_));
-    // Check that preserve\_index is positive
+    // Check that preserve_index is positive
-    OP\_REQUIRES(context, preserve\_index_ &gt;= 0,
+    OP_REQUIRES(context, preserve_index_ >= 0,
-                errors::InvalidArgument("Need preserve\_index &gt;= 0, got ",
+                errors::InvalidArgument("Need preserve_index >= 0, got ",
-                                        preserve\_index_));
+                                        preserve_index_));
-  </b>}
+  }
-  void Compute(OpKernelContext\* context) override {
+  void Compute(OpKernelContext* context) override {
    // ...
  }
- <b>private:
+ private:
-  int preserve\_index\_;</b>
+  int preserve_index_;
 };
-</code></pre>
+```
 which can then be used in the `Compute` method:
-<pre class="prettyprint"><code class="lang-cpp">
+```c++
-  void Compute(OpKernelContext\* context) override {
+  void Compute(OpKernelContext* context) override {
    // ...
-<br/>
+
-    <b>// We're using saved attr to validate potentially dynamic input
+    // We're using saved attr to validate potentially dynamic input
-    // So we check that preserve\_index is in range
+    // So we check that preserve_index is in range
-    OP\_REQUIRES(context, preserve\_index_ &lt; input.dimension(0),
+    OP_REQUIRES(context, preserve_index_ < input.dimension(0),
-                errors::InvalidArgument("preserve\_index out of range"));<br/>
+                errors::InvalidArgument("preserve_index out of range"));
-    </b>// Set all the elements of the output tensor to 0
+
    // Set all the elements of the output tensor to 0
    const int N = input.size();
    for (int i = 0; i < N; i++) {
      output\_flat(i) = 0;
-    }<br/>
+    }
-    <b>// Preserve the requested input value
+
-    output\_flat(preserve\_index\_) = input(preserve\_index\_);</b>
+    // Preserve the requested input value
    output_flat(preserve_index_) = input(preserve_index_);
  }
-</code></pre>
+```
 #### Attr types
@ -725,12 +727,12 @@ you would then register an `OpKernel` for each supported type.
 For instance, if you'd like the `ZeroOut` op to work on `float`s
 in addition to `int32`s, your op registration might look like:
-<pre class="prettyprint"><code class="lang-cpp">
+```c++
-REGISTER\_OP("ZeroOut")
+REGISTER_OP("ZeroOut")
-    <b>.Attr("T: {float, int32}")</b>
+    .Attr("T: {float, int32}")
-    .Input("to\_zero: <b>T</b>")
+    .Input("to_zero: T")
-    .Output("zeroed: <b>T</b>");
+    .Output("zeroed: T");
-</code></pre>
+```
 Your op registration now specifies that the input's type must be `float`, or
 `int32`, and that its output will be the same type, since both have type `T`.
@ -790,66 +792,73 @@ Your op registration now specifies that the input's type must be `float`, or
 >   """
 > ```
-<pre class="prettyprint"><code class="lang-cpp">
+```c++
-\#include "tensorflow/core/framework/op_kernel.h"<br/>
+#include "tensorflow/core/framework/op_kernel.h"
-class ZeroOut<b>Int32</b>Op : public OpKernel {
+
 class ZeroOutInt32Op : public OpKernel {
  // as before
-};<br/>
+};
-class ZeroOut<b>Float</b>Op : public OpKernel {
+
 class ZeroOutFloatOp : public OpKernel {
 public:
-  explicit ZeroOut<b>Float</b>Op(OpKernelConstruction\* context)
+  explicit ZeroOutFloatOp(OpKernelConstruction* context)
-      : OpKernel(context) {}<br/>
+      : OpKernel(context) {}
-  void Compute(OpKernelContext\* context) override {
+
  void Compute(OpKernelContext* context) override {
    // Grab the input tensor
-    const Tensor& input\_tensor = context-&gt;input(0);
+    const Tensor& input_tensor = context->input(0);
-    auto input = input\_tensor.flat&lt;<b>float</b>&gt;();<br/>
+    auto input = input_tensor.flat<float>();
    // Create an output tensor
    Tensor* output = NULL;
-    OP\_REQUIRES\_OK(context,
+    OP_REQUIRES_OK(context,
-                   context-&gt;allocate\_output(0, input_tensor.shape(), &output));
+                   context->allocate_output(0, input_tensor.shape(), &output));
-    auto output\_flat = output-&gt;template flat&lt;<b>float</b>&gt;();<br/>
+    auto output_flat = output->template flat<float>();
    // Set all the elements of the output tensor to 0
    const int N = input.size();
-    for (int i = 0; i &lt; N; i++) {
+    for (int i = 0; i < N; i++) {
-      output\_flat(i) = 0;
+      output_flat(i) = 0;
-    }<br/>
+    }
    // Preserve the first input value
-    if (N &gt; 0) output\_flat(0) = input(0);
+    if (N > 0) output_flat(0) = input(0);
  }
-};<br/><b>
+};
-// Note that TypeConstraint&lt;int32&gt;("T") means that attr "T" (defined
+
 // Note that TypeConstraint<int32>("T") means that attr "T" (defined
 // in the op registration above) must be "int32" to use this template
-// instantiation.</b>
+// instantiation.
-REGISTER\_KERNEL\_BUILDER(
+REGISTER_KERNEL_BUILDER(
    Name("ZeroOut")
-    .Device(DEVICE\_CPU)
+    .Device(DEVICE_CPU)
-    <b>.TypeConstraint&lt;int32&gt;("T"),</b>
+    .TypeConstraint<int32>("T"),
-    ZeroOutOp<b>Int32</b>);
+    ZeroOutOpInt32);
-<b>REGISTER\_KERNEL\_BUILDER(
+REGISTER_KERNEL_BUILDER(
    Name("ZeroOut")
-    .Device(DEVICE\_CPU)
+    .Device(DEVICE_CPU)
-    .TypeConstraint&lt;float&gt;("T"),
+    .TypeConstraint<float>("T"),
    ZeroOutFloatOp);
-</b></code></pre>
+```
 > To preserve [backwards compatibility](#backwards-compatibility), you should
 > specify a [default value](#default-values-constraints) when adding an attr to
 > an existing op:
 >
-> <pre class="prettyprint"><code class="lang-cpp">
+> ```c++
-> REGISTER\_OP("ZeroOut")
+> REGISTER_OP("ZeroOut")
->   <b>.Attr("T: {float, int32} = DT_INT32")</b>
+>   .Attr("T: {float, int32} = DT_INT32")
->   .Input("to\_zero: T")
+>   .Input("to_zero: T")
 >   .Output("zeroed: T")
-> </code></pre>
+> ```
 Let's say you wanted to add more types, say `double`:
-<pre class="prettyprint"><code class="lang-cpp">
+```c++
-REGISTER\_OP("ZeroOut")
+REGISTER_OP("ZeroOut")
-    <b>.Attr("T: {float, <b>double,</b> int32}")</b>
+    .Attr("T: {float, double, int32}")
-    .Input("to\_zero: <b>T</b>")
+    .Input("to_zero: T")
-    .Output("zeroed: <b>T</b>");
+    .Output("zeroed: T");
-</code></pre>
+```
 Instead of writing another `OpKernel` with redundant code as above, often you
 will be able to use a C++ template instead.  You will still have one kernel
--- a/tensorflow/docs_src/get_started/custom_estimators.md
+++ b/tensorflow/docs_src/get_started/custom_estimators.md
@ -546,7 +546,7 @@ In brief, here's what the three graphs tell you:
 * accuracy: The accuracy is recorded by the following two lines:
-    * `eval_metric_ops={'my_accuracy': accuracy})`, during evaluation.
+    * `eval_metric_ops={'my_accuracy': accuracy}`, during evaluation.
    * `tf.summary.scalar('accuracy', accuracy[1])`, during training.
 These tensorboard graphs are one of the main reasons it's important to pass a
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@ -113,6 +113,6 @@ If executing `a.out` fails, ask yourself the following questions:
  * Did you export those environment variables?
 If you are still seeing build or execution error messages, search (or post to)
-[StackOverflow](www.stackoverflow.com/questions/tagged/tensorflow) for
+[StackOverflow](https://stackoverflow.com/questions/tagged/tensorflow) for
 possible solutions.
--- a/tensorflow/docs_src/performance/performance_guide.md
+++ b/tensorflow/docs_src/performance/performance_guide.md
@ -475,7 +475,7 @@ optimizations.
 ### TensorFlow with Intel® MKL DNN
 Intel® has added optimizations to TensorFlow for Intel® Xeon® and Intel® Xeon
-Phi™ though the use of Intel® Math Kernel Library for Deep Neural Networks
+Phi™ through the use of the Intel® Math Kernel Library for Deep Neural Networks
 (Intel® MKL-DNN) optimized primitives. The optimizations also provide speedups
 for the consumer line of processors, e.g. i5 and i7 Intel processors. The Intel
 published paper
@ -581,9 +581,9 @@ Each variable that impacts performance is discussed below.
    for optimal settings.
 *   **intra_op_parallelism_threads**: Setting this equal to the number of
-    physical cores is recommended. Setting the value to 0, which is the default
+    physical cores is recommended. Setting the value to 0, which is the default,
-    and will result in the value being set to the number of logical cores, is an
+    results in the value being set to the number of logical cores - this is an
-    option to try for some architectures.  This value and `OMP_NUM_THREADS`
+    alternate option to try for some architectures.  This value and `OMP_NUM_THREADS`
    should be equal.
 *   **inter_op_parallelism_threads**: Setting this equal to the number of
--- a/tensorflow/docs_src/programmers_guide/debugger.md
+++ b/tensorflow/docs_src/programmers_guide/debugger.md
@ -4,29 +4,28 @@
 [TOC]
-TensorFlow debugger (**tfdbg**) is a specialized debugger for TensorFlow. It
+`tfdbg` is a specialized debugger for TensorFlow. It lets you view the internal
-lets you view the internal structure and states of running TensorFlow graphs
+structure and states of running TensorFlow graphs during training and inference,
-during training and inference, which is difficult to debug with general-purpose
+which is difficult to debug with general-purpose debuggers such as Python's `pdb`
-debuggers such as Python's `pdb` due to TensorFlow's computation-graph paradigm.
+due to TensorFlow's computation-graph paradigm.
-> NOTE: TensorFlow debugger uses a
+This guide focuses on the command-line interface (CLI) of `tfdbg`. For guide on
-> [curses](https://en.wikipedia.org/wiki/Curses_\(programming_library\))-based
+how to use the graphical user interface (GUI) of tfdbg, i.e., the
-> text user interface. On Mac OS X, the `ncurses` library is required and can
+**TensorBoard Debugger Plugin**, please visit
-> be installed with `brew install homebrew/dupes/ncurses`. On Windows, curses
+[its README](https://github.com/tensorflow/tensorboard/blob/master/tensorboard/plugins/debugger/README.md).
 > isn't as well supported, so a
 > [readline](https://en.wikipedia.org/wiki/GNU_Readline)-based interface can
 > be used with tfdbg by installing `pyreadline` with pip.
 > If you use Anaconda3, you can install it with a command
 > such as `"C:\Program Files\Anaconda3\Scripts\pip.exe" install pyreadline`.
 > Unofficial Windows curses packages can be downloaded
 > [here](https://www.lfd.uci.edu/~gohlke/pythonlibs/#curses), then subsequently
 > installed using `pip install <your_version>.whl`, however curses on Windows
 > may not work as reliably as curses on Linux or Mac.
-> NOTE: This guide focuses on the command-line interface (CLI) of tfdbg. For
+Note: The TensorFlow debugger uses a
-> guide on how to use the graphical user interface (GUI) of tfdbg, i.e., the
+[curses](https://en.wikipedia.org/wiki/Curses_\(programming_library\))-based text
-> **TensorBoard Debugger Plugin**, please visit
+user interface. On Mac OS X, the `ncurses` library is required and can be
-> [its README](https://github.com/tensorflow/tensorboard/blob/master/tensorboard/plugins/debugger/README.md).
+installed with `brew install homebrew/dupes/ncurses`. On Windows, curses isn't as
 well supported, so a [readline](https://en.wikipedia.org/wiki/GNU_Readline)-based
 interface can be used with tfdbg by installing `pyreadline` with `pip`. If you
 use Anaconda3, you can install it with a command such as
 `"C:\Program Files\Anaconda3\Scripts\pip.exe" install pyreadline`. Unofficial
 Windows curses packages can be downloaded
 [here](https://www.lfd.uci.edu/~gohlke/pythonlibs/#curses), then subsequently
 installed using `pip install <your_version>.whl`, however curses on Windows may
 not work as reliably as curses on Linux or Mac.
 This tutorial demonstrates how to use the **tfdbg** CLI to debug the appearance
 of [`nan`s](https://en.wikipedia.org/wiki/NaN)
@ -748,16 +747,16 @@ There are three possible workarounds or solutions:
   to which tfdbg dumps the debug data. You can use it to let tfdbg dump the
   debug data on a disk with larger free space. For example:
-   ``` python
+```python
-   # For LocalCLIDebugWrapperSession
+# For LocalCLIDebugWrapperSession
-   sess = tf_debug.LocalCLIDebugWrapperSession(dump_root="/with/lots/of/space")
+sess = tf_debug.LocalCLIDebugWrapperSession(dump_root="/with/lots/of/space")
   # For LocalCLIDebugHook
   hooks = [tf_debug.LocalCLIDebugHook(dump_root="/with/lots/of/space")]
   ```
 # For LocalCLIDebugHook
 hooks = [tf_debug.LocalCLIDebugHook(dump_root="/with/lots/of/space")]
 ```
   Make sure that the directory pointed to by dump_root is empty or nonexistent.
-   tfdbg cleans up the dump directories before exiting.
+   `tfdbg` cleans up the dump directories before exiting.
 *  Reduce the batch size used during the runs.
 *  Use the filtering options of tfdbg's `run` command to watch only specific
   nodes in the graph. For example:
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@ -835,6 +835,7 @@ py_library(
    srcs = ["framework/tensor_shape.py"],
    srcs_version = "PY2AND3",
    deps = [
        ":dtypes",
        ":util",
        "//tensorflow/core:protos_all_py",
    ],
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@ -651,6 +651,11 @@ QUANTIZED_DTYPES = frozenset([
 ])
 tf_export("QUANTIZED_DTYPES").export_constant(__name__, "QUANTIZED_DTYPES")
 _PYTHON_TO_TF = {
    float: float32,
    bool: bool,
 }
@tf_export("as_dtype")
 def as_dtype(type_value):
@ -682,6 +687,11 @@ def as_dtype(type_value):
  except KeyError:
    pass
  try:
    return _PYTHON_TO_TF[type_value]
  except KeyError:
    pass
  if isinstance(type_value, np.dtype):
    # The numpy dtype for strings is variable length. We can not compare
    # dtype with a single constant (np.string does not exist) to decide
--- a/tensorflow/python/framework/dtypes_test.py
+++ b/tensorflow/python/framework/dtypes_test.py
@ -295,6 +295,10 @@ class TypesTest(test_util.TensorFlowTestCase):
    self.assertNotEqual(dtypes.int32, int)
    self.assertNotEqual(dtypes.float64, 2.1)
  def testPythonTypesConversion(self):
    self.assertIs(dtypes.float32, dtypes.as_dtype(float))
    self.assertIs(dtypes.bool, dtypes.as_dtype(bool))
  def testReduce(self):
    for enum in dtypes._TYPE_TO_STRING:
      dtype = dtypes.DType(enum)
@ -307,3 +311,4 @@ class TypesTest(test_util.TensorFlowTestCase):
 if __name__ == "__main__":
  googletest.main()
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@ -37,7 +37,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_logging_ops
@ -1362,7 +1361,7 @@ class UnrollLSTMTest(test.TestCase):
        value=math_ops.matmul(xm, weights), num_or_size_splits=4, axis=1)
    new_c = math_ops.sigmoid(f_g) * cprev + math_ops.sigmoid(
        i_g) * math_ops.tanh(i_i)
-    new_c = clip_ops.clip_by_value(new_c, -50.0, 50.0)
+    new_c = math_ops.maximum(math_ops.minimum(new_c, 50.0), -50.0)
    new_m = math_ops.sigmoid(o_g) * math_ops.tanh(new_c)
    return new_m, new_c
--- a/tensorflow/python/framework/tensor_shape.py
+++ b/tensorflow/python/framework/tensor_shape.py
@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 from tensorflow.core.framework import tensor_shape_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.util import compat
 from tensorflow.python.util.tf_export import tf_export
@ -30,6 +31,8 @@ class Dimension(object):
    """Creates a new Dimension with the given value."""
    if value is None:
      self._value = None
    elif isinstance(value, dtypes.DType):
      raise TypeError("Cannot convert %s to Dimension" % value)
    else:
      self._value = int(value)
      if (not isinstance(value, compat.bytes_or_text_types) and
--- a/tensorflow/python/framework/tensor_shape_test.py
+++ b/tensorflow/python/framework/tensor_shape_test.py
@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 from tensorflow.core.framework import tensor_shape_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import googletest
@ -184,6 +185,10 @@ class DimensionTest(test_util.TensorFlowTestCase):
    self.assertEqual(str(tensor_shape.Dimension(7)), "7")
    self.assertEqual(str(tensor_shape.Dimension(None)), "?")
  def testUnsupportedType(self):
    with self.assertRaises(TypeError):
      tensor_shape.Dimension(dtypes.string)
  def testMod(self):
    four = tensor_shape.Dimension(4)
    nine = tensor_shape.Dimension(9)
--- a/tensorflow/python/keras/_impl/keras/utils/io_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/io_utils.py
@ -19,9 +19,9 @@ from __future__ import division
 from __future__ import print_function
 from collections import defaultdict
 import sys
 import numpy as np
 import six
 from tensorflow.python.util.tf_export import tf_export
@ -160,13 +160,11 @@ def ask_to_proceed_with_overwrite(filepath):
  Returns:
      True if we can proceed with overwrite, False otherwise.
  """
-  get_input = input
+  overwrite = six.moves.input('[WARNING] %s already exists - overwrite? '
-  if sys.version_info[:2] <= (2, 7):
+                              '[y/n]' % (filepath)).strip().lower()
-    get_input = raw_input
+  while overwrite not in ('y', 'n'):
-  overwrite = get_input('[WARNING] %s already exists - overwrite? '
+    overwrite = six.moves.input('Enter "y" (overwrite) or "n" '
-                        '[y/n]' % (filepath))
+                                '(cancel).').strip().lower()
  while overwrite not in ['y', 'n']:
    overwrite = get_input('Enter "y" (overwrite) or "n" (cancel).')
  if overwrite == 'n':
    return False
  print('[TIP] Next time specify overwrite=True!')
--- a/tensorflow/python/kernel_tests/clip_ops_test.py
+++ b/tensorflow/python/kernel_tests/clip_ops_test.py
@ -19,16 +19,33 @@ from __future__ import division
 from __future__ import print_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.platform import test
 class ClipTest(test.TestCase):
  def DISABLED_testClipByValueGradient(self):
    inputs = constant_op.constant([1.0, 2.0, 3.0, 4.0], dtype=dtypes.float32)
    outputs_1 = clip_ops.clip_by_value(inputs, 0.5, 3.5)
    min_val = constant_op.constant([0.5, 0.5, 0.5, 0.5], dtype=dtypes.float32)
    max_val = constant_op.constant([3.5, 3.5, 3.5, 3.5], dtype=dtypes.float32)
    outputs_2 = clip_ops.clip_by_value(inputs, min_val, max_val)
    with self.test_session():
      error_1 = gradient_checker.compute_gradient_error(inputs, [4], outputs_1,
                                                        [4])
      self.assertLess(error_1, 1e-4)
      error_2 = gradient_checker.compute_gradient_error(inputs, [4], outputs_2,
                                                        [4])
      self.assertLess(error_2, 1e-4)
  # ClipByValue test
  def testClipByValue(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
      x = constant_op.constant([-5.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3])
      np_ans = [[-4.4, 2.0, 3.0], [4.0, 4.4, 4.4]]
      clip_value = 4.4
@ -37,8 +54,76 @@ class ClipTest(test.TestCase):
    self.assertAllClose(np_ans, tf_ans)
  # [Tensor, Scalar, Scalar]
  def DISABLED_testClipByValue0Type(self):
    for dtype in [
        dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int8,
        dtypes.int16, dtypes.int32, dtypes.int64, dtypes.uint8, dtypes.uint16
    ]:
      with self.test_session(use_gpu=True):
        x = constant_op.constant([1, 2, 3, 4, 5, 6], shape=[2, 3], dtype=dtype)
        np_ans = [[2, 2, 3], [4, 4, 4]]
        clip_value_min = 2
        clip_value_max = 4
        ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
        tf_ans = ans.eval()
      self.assertAllClose(np_ans, tf_ans)
  # [Tensor, Tensor, Scalar]
  def DISABLED_testClipByValue1Type(self):
    for dtype in [
        dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int8,
        dtypes.int16, dtypes.int32, dtypes.int64, dtypes.uint8, dtypes.uint16
    ]:
      with self.test_session(use_gpu=True):
        x = constant_op.constant([1, 2, 3, 4, 5, 6], shape=[2, 3], dtype=dtype)
        np_ans = [[2, 2, 3], [4, 4, 4]]
        clip_value_min = constant_op.constant(
            [2, 2, 2, 3, 3, 3], shape=[2, 3], dtype=dtype)
        clip_value_max = 4
        ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
        tf_ans = ans.eval()
      self.assertAllClose(np_ans, tf_ans)
  # [Tensor, Scalar, Tensor]
  def DISABLED_testClipByValue2Type(self):
    for dtype in [
        dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int8,
        dtypes.int16, dtypes.int32, dtypes.int64, dtypes.uint8, dtypes.uint16
    ]:
      with self.test_session(use_gpu=True):
        x = constant_op.constant([1, 2, 3, 4, 5, 6], shape=[2, 3], dtype=dtype)
        np_ans = [[4, 4, 4], [4, 5, 6]]
        clip_value_min = 4
        clip_value_max = constant_op.constant(
            [6, 6, 6, 6, 6, 6], shape=[2, 3], dtype=dtype)
        ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
        tf_ans = ans.eval()
      self.assertAllClose(np_ans, tf_ans)
  # [Tensor, Tensor, Tensor]
  def DISABLED_testClipByValue3Type(self):
    for dtype in [
        dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int8,
        dtypes.int16, dtypes.int32, dtypes.int64, dtypes.uint8, dtypes.uint16
    ]:
      with self.test_session(use_gpu=True):
        x = constant_op.constant([1, 2, 3, 4, 5, 6], shape=[2, 3], dtype=dtype)
        np_ans = [[2, 2, 3], [5, 5, 6]]
        clip_value_min = constant_op.constant(
            [2, 2, 2, 5, 5, 5], shape=[2, 3], dtype=dtype)
        clip_value_max = constant_op.constant(
            [5, 5, 5, 7, 7, 7], shape=[2, 3], dtype=dtype)
        ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
        tf_ans = ans.eval()
      self.assertAllClose(np_ans, tf_ans)
  def testClipByValueBadShape(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
      x = constant_op.constant([-5.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3, 1])
      # Use a nonsensical shape.
      clip = constant_op.constant([1.0, 2.0])
@ -48,6 +133,7 @@ class ClipTest(test.TestCase):
        _ = clip_ops.clip_by_value(x, 1.0, clip)
  def testClipByValueNonFinite(self):
    # TODO(b/78016351): Enable test on GPU once the bug is fixed.
    with self.test_session():
      x = constant_op.constant([float('NaN'), float('Inf'), -float('Inf')])
      np_ans = [float('NaN'), 4.0, -4.0]
@ -60,7 +146,7 @@ class ClipTest(test.TestCase):
  # ClipByNorm tests
  def testClipByNormClipped(self):
    # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
      x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
      # Norm of x = sqrt(3^2 + 4^2) = 5
      np_ans = [[-2.4, 0.0, 0.0], [3.2, 0.0, 0.0]]
@ -76,7 +162,7 @@ class ClipTest(test.TestCase):
    self.assertAllClose(np_ans, tf_ans_tensor)
  def testClipByNormBadShape(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
      x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3, 1])
      # Use a nonsensical shape.
      clip = constant_op.constant([1.0, 2.0])
@ -85,7 +171,7 @@ class ClipTest(test.TestCase):
  def testClipByNormNotClipped(self):
    # No norm clipping when clip_norm >= 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
      x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
      # Norm of x = sqrt(3^2 + 4^2) = 5
      np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 0.0]]
@ -97,7 +183,7 @@ class ClipTest(test.TestCase):
  def testClipByNormZero(self):
    # No norm clipping when norm = 0
-    with self.test_session():
+    with self.test_session(use_gpu=True):
      x = constant_op.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=[2, 3])
      # Norm = 0, no changes
      np_ans = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
@ -109,7 +195,7 @@ class ClipTest(test.TestCase):
  def testClipByNormClippedWithDim0(self):
    # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
      x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 3.0], shape=[2, 3])
      # Norm of x[:, 0] = sqrt(3^2 + 4^2) = 5, x[:, 2] = 3
      np_ans = [[-2.4, 0.0, 0.0], [3.2, 0.0, 3.0]]
@ -121,7 +207,7 @@ class ClipTest(test.TestCase):
  def testClipByNormClippedWithDim1(self):
    # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
      x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 3.0], shape=[2, 3])
      # Norm of x[0, :] = 3, x[1, :] = sqrt(3^2 + 4^2) = 5
      np_ans = [[-3.0, 0.0, 0.0], [3.2, 0.0, 2.4]]
@ -133,7 +219,7 @@ class ClipTest(test.TestCase):
  def testClipByNormNotClippedWithAxes(self):
    # No norm clipping when clip_norm >= 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
      x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 3.0], shape=[2, 3])
      # Norm of x[0, :] = 3, x[1, :] = sqrt(3^2 + 4^2) = 5
      np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 3.0]]
@ -146,7 +232,7 @@ class ClipTest(test.TestCase):
  # ClipByGlobalNorm tests
  def testClipByGlobalNormClipped(self):
    # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
      x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
      x1 = constant_op.constant([1.0, -2.0])
      # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
@ -167,7 +253,7 @@ class ClipTest(test.TestCase):
  def testClipByGlobalNormClippedTensor(self):
    # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
      x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
      x1 = constant_op.constant([1.0, -2.0])
      # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
@ -188,7 +274,7 @@ class ClipTest(test.TestCase):
  def testClipByGlobalNormSupportsNone(self):
    # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
      x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
      x1 = constant_op.constant([1.0, -2.0])
      # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
@ -211,7 +297,7 @@ class ClipTest(test.TestCase):
  def testClipByGlobalNormWithIndexedSlicesClipped(self):
    # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
      x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
      x1 = ops.IndexedSlices(
          constant_op.constant([1.0, -2.0]), constant_op.constant([3, 4]))
@ -244,7 +330,7 @@ class ClipTest(test.TestCase):
  def testClipByGlobalNormNotClipped(self):
    # No norm clipping when clip_norm >= 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
      x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
      x1 = constant_op.constant([1.0, -2.0])
      # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
@ -263,7 +349,7 @@ class ClipTest(test.TestCase):
  def testClipByGlobalNormZero(self):
    # No norm clipping when norm = 0
-    with self.test_session():
+    with self.test_session(use_gpu=True):
      x0 = constant_op.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=[2, 3])
      x1 = constant_op.constant([0.0, 0.0])
      # Norm = 0, no changes
@ -282,7 +368,7 @@ class ClipTest(test.TestCase):
  def testClipByAverageNormClipped(self):
    # Norm clipping when average clip_norm < 0.83333333
-    with self.test_session():
+    with self.test_session(use_gpu=True):
      x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
      # Average norm of x = sqrt(3^2 + 4^2) / 6 = 0.83333333
      np_ans = [[-2.88, 0.0, 0.0], [3.84, 0.0, 0.0]]
@ -294,7 +380,7 @@ class ClipTest(test.TestCase):
  def testClipByAverageNormClippedTensor(self):
    # Norm clipping when average clip_norm < 0.83333333
-    with self.test_session():
+    with self.test_session(use_gpu=True):
      x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
      # Average norm of x = sqrt(3^2 + 4^2) / 6 = 0.83333333
      np_ans = [[-2.88, 0.0, 0.0], [3.84, 0.0, 0.0]]
@ -306,7 +392,7 @@ class ClipTest(test.TestCase):
  def testClipByAverageNormNotClipped(self):
    # No norm clipping when average clip_norm >= 0.83333333
-    with self.test_session():
+    with self.test_session(use_gpu=True):
      x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
      # Average norm of x = sqrt(3^2 + 4^2) / 6 = 0.83333333
      np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 0.0]]
@ -318,7 +404,7 @@ class ClipTest(test.TestCase):
  def testClipByAverageNormZero(self):
    # No norm clipping when average clip_norm = 0
-    with self.test_session():
+    with self.test_session(use_gpu=True):
      x = constant_op.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=[2, 3])
      # Average norm = 0, no changes
      np_ans = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@ -817,9 +817,6 @@ class PoolingTest(test.TestCase):
          cpu_val, gpu_val, half_rtol=0.01, half_atol=0.01)
  def testMaxPoolingWithArgmax(self):
    # MaxPoolWithArgMax is implemented only on CUDA.
    if not test.is_gpu_available(cuda_only=True):
      return
    tensor_input = [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0]
    with self.test_session(use_gpu=True) as sess:
      t = constant_op.constant(tensor_input, shape=[1, 3, 3, 1])
@ -836,9 +833,6 @@ class PoolingTest(test.TestCase):
      self.assertAllEqual(argmax.ravel(), [0, 1, 3, 5])
  def testMaxPoolingGradWithArgmax(self):
    # MaxPoolWithArgMax is implemented only on CUDA.
    if not test.is_gpu_available(cuda_only=True):
      return
    orig_input = [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0]
    tensor_input = [11.0, 12.0, 13.0, 14.0]
    tensor_argmax = list(np.array([0, 1, 3, 5], dtype=np.int64))
--- a/tensorflow/python/ops/clip_ops.py
+++ b/tensorflow/python/ops/clip_ops.py
@ -26,6 +26,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
@ -70,6 +71,35 @@ def clip_by_value(t, clip_value_min, clip_value_max,
    _ = t.shape.merge_with(t_max.shape)
  return t_max
  # TODO(scottzhu): switch to use new implmentation in 2 weeks.
    # return gen_math_ops.clip_by_value(
    #     t, clip_value_min, clip_value_max, name=name)
 # TODO(scottzhu): switch to use new implmentation in 2 weeks.
 # @ops.RegisterGradient("ClipByValue")
 def _clip_by_value_grad(op, grad):
  """Returns grad of clip_by_value."""
  x = op.inputs[0]
  y = op.inputs[1]
  z = op.inputs[2]
  gdtype = grad.dtype
  sx = array_ops.shape(x)
  sy = array_ops.shape(y)
  sz = array_ops.shape(z)
  gradshape = array_ops.shape(grad)
  zeros = array_ops.zeros(gradshape, gdtype)
  xymask = math_ops.less(x, y)
  xzmask = math_ops.greater(x, z)
  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
  rx, rz = gen_array_ops.broadcast_gradient_args(sx, sz)
  xgrad = array_ops.where(math_ops.logical_or(xymask, xzmask), zeros, grad)
  ygrad = array_ops.where(xymask, grad, zeros)
  zgrad = array_ops.where(xzmask, grad, zeros)
  gx = array_ops.reshape(math_ops.reduce_sum(xgrad, rx), sx)
  gy = array_ops.reshape(math_ops.reduce_sum(ygrad, ry), sy)
  gz = array_ops.reshape(math_ops.reduce_sum(zgrad, rz), sz)
  return (gx, gy, gz)
@tf_export("clip_by_norm")
--- a/tensorflow/python/ops/hidden_ops.txt
+++ b/tensorflow/python/ops/hidden_ops.txt
@ -0,0 +1,395 @@
 # array_ops
 BatchToSpace
 BroadcastArgs
 BroadcastGradientArgs
 ConcatOffset
 Concat
 ConcatV2
 ConjugateTranspose
 Const
 DebugGradientIdentity
 DebugGradientRefIdentity
 EditDistance
 ExpandDims
 ListDiff
 MirrorPad
 MirrorPadGrad
 OneHot
 Pack
 Pad
 PadV2
 ParallelConcat
 Placeholder
 RefIdentity
 Reverse
 Snapshot
 SpaceToBatch
 Split
 SplitV
 Squeeze
 Slice
 TileGrad  # Exported through array_grad instead of array_ops.
 ZerosLike  # TODO(josh11b): Use this instead of the Python version.
 Unique
 UniqueV2
 UniqueWithCounts
 UniqueWithCountsV2
 Unpack
 # candidate_sampling_ops
 AllCandidateSampler
 ComputeAccidentalHits
 FixedUnigramCandidateSampler
 LearnedUnigramCandidateSampler
 LogUniformCandidateSampler
 ThreadUnsafeUnigramCandidateSampler
 UniformCandidateSampler
 # checkpoint_ops
 GenerateVocabRemapping
 LoadAndRemapMatrix
 # control_flow_ops
 Switch
 Merge
 RefMerge
 Exit
 RefExit
 # ctc_ops
 CTCLoss
 CTCGreedyDecoder
 CTCBeamSearchDecoder
 # data_flow_ops
 Barrier
 BarrierClose
 BarrierIncompleteSize
 BarrierInsertMany
 BarrierReadySize
 BarrierTakeMany
 DeleteSessionTensor
 FakeQueue
 FIFOQueue
 FIFOQueueV2
 GetSessionHandle
 GetSessionHandleV2
 GetSessionTensor
 HashTable
 HashTableV2
 InitializeTable
 InitializeTableV2
 InitializeTableFromTextFile
 InitializeTableFromTextFileV2
 LookupTableExport
 LookupTableExportV2
 LookupTableFind
 LookupTableFindV2
 LookupTableImport
 LookupTableImportV2
 LookupTableInsert
 LookupTableInsertV2
 LookupTableSize
 LookupTableSizeV2
 MutableDenseHashTable
 MutableDenseHashTableV2
 MutableHashTable
 MutableHashTableV2
 MutableHashTableOfTensors
 MutableHashTableOfTensorsV2
 Mutex
 MutexAcquire
 MutexRelease
 PaddingFIFOQueue
 PaddingFIFOQueueV2
 PriorityQueue
 PriorityQueueV2
 QueueClose
 QueueCloseV2
 QueueDequeue
 QueueDequeueV2
 QueueDequeueMany
 QueueDequeueManyV2
 QueueDequeueUpTo
 QueueDequeueUpToV2
 QueueEnqueue
 QueueEnqueueV2
 QueueEnqueueMany
 QueueEnqueueManyV2
 QueueSize
 QueueSizeV2
 RandomShuffleQueue
 RandomShuffleQueueV2
 Stack
 StackClose
 StackPop
 StackPush
 StackV2
 StackCloseV2
 StackPopV2
 StackPushV2
 TensorArray
 TensorArrayClose
 TensorArrayCloseV2
 TensorArrayConcat
 TensorArrayConcatV2
 TensorArrayGather
 TensorArrayGatherV2
 TensorArrayGrad
 TensorArrayGradV2
 TensorArrayPack
 TensorArrayPackV2
 TensorArrayRead
 TensorArrayReadV2
 TensorArrayScatter
 TensorArrayScatterV2
 TensorArraySize
 TensorArraySizeV2
 TensorArraySplit
 TensorArraySplitV2
 TensorArrayUnpack
 TensorArrayUnpackV2
 TensorArrayV2
 TensorArrayWrite
 TensorArrayWriteV2
 TensorArrayV3
 TensorArrayCloseV3
 TensorArrayConcatV3
 TensorArrayGatherV3
 TensorArrayGradV3
 TensorArrayReadV3
 TensorArrayPackV3
 TensorArrayScatterV3
 TensorArraySizeV3
 TensorArraySplitV3
 TensorArrayUnpackV3
 TensorArrayWriteV3
 # functional_ops
 SymbolicGradient
 # image_ops
 AdjustContrastv2
 NonMaxSuppression
 NonMaxSuppressionV2
 RandomCrop
 ResizeBilinearGrad
 ResizeBicubicGrad
 ResizeNearestNeighborGrad
 SampleDistortedBoundingBox
 SampleDistortedBoundingBoxV2
 ScaleImageGrad
 # io_ops
 FixedLengthRecordReader
 IdentityReader
 ReaderNumRecordsProduced
 ReaderNumWorkUnitsCompleted
 ReaderRead
 ReaderReadUpTo
 ReaderReset
 ReaderRestoreState
 ReaderSerializeState
 ReaderWorkQueueLength
 FixedLengthRecordReaderV2
 IdentityReaderV2
 ReaderNumRecordsProducedV2
 ReaderNumWorkUnitsCompletedV2
 ReaderReadV2
 ReaderReadUpToV2
 ReaderResetV2
 ReaderRestoreStateV2
 ReaderSerializeStateV2
 ReaderWorkQueueLengthV2
 Restore
 RestoreSlice
 Save
 SaveSlices
 ShardedFilename
 ShardedFilespec
 TextLineReader
 TFRecordReader
 WholeFileReader
 TextLineReaderV2
 TFRecordReaderV2
 WholeFileReaderV2
 LMDBReader
 DecodeCSV
 # linalg_ops
 BatchCholesky
 BatchCholeskyGrad
 BatchMatrixDeterminant
 BatchMatrixInverse
 BatchMatrixSolve
 BatchMatrixSolveLs
 BatchMatrixTriangularSolve
 BatchSelfAdjointEig
 BatchSelfAdjointEigV2
 BatchSvd
 LogMatrixDeterminant
 MatrixExponential
 MatrixLogarithm
 MatrixSolveLs
 SelfAdjointEig
 SelfAdjointEigV2
 Svd
 # logging_ops
 Assert
 AudioSummary
 AudioSummaryV2
 HistogramSummary
 ImageSummary
 MergeSummary
 Print
 ScalarSummary
 TensorSummary
 TensorSummaryV2
 # math_ops
 Abs
 AccumulateNV2
 AddN
 AddV2
 All
 Any
 BatchMatMul
 BatchFFT
 BatchFFT2D
 BatchFFT3D
 BatchIFFT
 BatchIFFT2D
 BatchIFFT3D
 Bucketize
 ClipByValue
 Complex
 ComplexAbs
 Conj
 FloorDiv
 FloorMod
 HistogramFixedWidth
 Max
 Mean
 Min
 Mul
 Neg
 Pow
 Prod
 Range
 RealDiv
 Select
 SparseMatMul
 Sub
 Sum
 MatMul
 Sigmoid
 Tanh
 SigmoidGrad
 TanhGrad
 InvGrad
 ReciprocalGrad
 SqrtGrad
 RsqrtGrad
 TruncateDiv
 TruncateMod
 # nn_ops
 AvgPoolGrad  # "*Grad" accessible through nn_grad instead of nn_ops.
 AvgPool3DGrad
 BatchNormWithGlobalNormalization
 BatchNormWithGlobalNormalizationGrad
 FusedBatchNorm
 FusedBatchNormV2
 SoftmaxCrossEntropyWithLogits
 SparseSoftmaxCrossEntropyWithLogits
 LRNGrad
 MaxPoolGrad
 MaxPoolGradWithArgmax
 MaxPoolGradGrad
 MaxPoolGradGradWithArgmax
 MaxPool3DGrad
 MaxPool3DGradGrad
 ReluGrad
 Relu6Grad
 EluGrad
 SeluGrad
 SoftplusGrad
 SoftsignGrad
 TopK
 TopKV2
 BiasAdd
 BiasAddV1
 Relu6
 AvgPool
 MaxPool
 MaxPoolV2
 Softmax
 LogSoftmax
 FractionalAvgPoolGrad
 FractionalMaxPoolGrad
 InTopK
 InTopKV2
 # parsing_ops
 ParseExample
 ParseSingleSequenceExample
 # random_ops
 RandomGamma
 RandomPoisson
 RandomUniform
 RandomUniformInt
 RandomShuffle
 RandomStandardNormal
 ParameterizedTruncatedNormal
 TruncatedNormal
 # script_ops
 PyFunc
 PyFuncStateless
 EagerPyFunc
 # sdca_ops
 # state_ops
 Variable
 VariableV2
 TemporaryVariable
 DestroyTemporaryVariable
 # sparse_ops
 AddSparseToTensorsMap
 AddManySparseToTensorsMap
 TakeManySparseFromTensorsMap
 DeserializeManySparse
 DeserializeSparse
 SerializeManySparse
 SerializeSparse
 SparseAdd
 SparseAddGrad
 SparseConcat
 SparseCross
 SparseFillEmptyRows
 SparseFillEmptyRowsGrad
 SparseSplit
 SparseSelectLastK
 SparseReorder
 SparseReshape
 SparseToDense
 SparseTensorDenseAdd
 SparseTensorDenseMatMul
 # string_ops
 StringSplit
 # user_ops
 Fact
 # training_ops
 # (None)
 # word2vec deprecated ops
 NegTrain
 Skipgram
--- a/tensorflow/python/util/tf_inspect.py
+++ b/tensorflow/python/util/tf_inspect.py
@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 from collections import namedtuple
 import inspect as _inspect
 from tensorflow.python.util import tf_decorator
@ -24,6 +25,15 @@ from tensorflow.python.util import tf_decorator
 ArgSpec = _inspect.ArgSpec
 if hasattr(_inspect, 'FullArgSpec'):
  FullArgSpec = _inspect.FullArgSpec  # pylint: disable=invalid-name
 else:
  FullArgSpec = namedtuple('FullArgSpec', [
      'args', 'varargs', 'varkw', 'defaults', 'kwonlyargs', 'kwonlydefaults',
      'annotations'
  ])
 def currentframe():
  """TFDecorator-aware replacement for inspect.currentframe."""
  return _inspect.stack()[1][0]
@ -55,13 +65,36 @@ def getfullargspec(obj):  # pylint: disable=redefined-builtin
    obj: A callable, possibly decorated.
  Returns:
-    The `FullArgSpec` (`ArgSpec` in Python 2) that describes the signature of
+    The `FullArgSpec` that describes the signature of
    the outermost decorator that changes the callable's signature. If the
-    callable is not decorated, `inspect.getfullargspec()`
+    callable is not decorated, `inspect.getfullargspec()` will be called
-    (`inspect.getargspec()` in Python 2) will be called directly on the
+    directly on the callable.
    callable.
  """
-  spec_fn = getattr(_inspect, 'getfullargspec', getattr(_inspect, 'getargspec'))
+  if hasattr(_inspect, 'getfullargspec'):
    spec_fn = _inspect.getfullargspec
  else:
    def spec_fn(target):
      """Spec function that adding default value from FullArgSpec.
      It is used when getfullargspec is not available (eg in PY2).
      Args:
        target: the target object to inspect.
      Returns:
        The full argument specs with empty kwonlyargs, kwonlydefaults and
        annotations.
      """
      argspecs = _inspect.getargspec(target)
      fullargspecs = FullArgSpec(
          args=argspecs.args,
          varargs=argspecs.varargs,
          varkw=argspecs.keywords,
          defaults=argspecs.defaults,
          kwonlyargs=[],
          kwonlydefaults=None,
          annotations={})
      return fullargspecs
  decorators, target = tf_decorator.unwrap(obj)
  return next((d.decorator_argspec for d in decorators
               if d.decorator_argspec is not None), spec_fn(target))
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@ -810,7 +810,33 @@ def tf_cc_test_mkl(srcs,
                   tags=[],
                   size="medium",
                   args=None):
-  if_mkl(tf_cc_tests(srcs, deps, name, linkstatic=linkstatic, tags=tags, size=size, args=args, nocopts="-fno-exceptions"))
+  for src in srcs:
    native.cc_test(
      name=src_to_test_name(src),
      srcs=if_mkl([src]) + tf_binary_additional_srcs(),
      copts=tf_copts(),
      linkopts=select({
        clean_dep("//tensorflow:android"): [
            "-pie",
          ],
        clean_dep("//tensorflow:windows"): [],
        clean_dep("//tensorflow:windows_msvc"): [],
        "//conditions:default": [
            "-lpthread",
            "-lm"
        ],
      }) + _rpath_linkopts(src_to_test_name(src)),
      deps=deps + if_mkl(
          [
              "//third_party/mkl:intel_binary_blob",
          ],
      ),
      linkstatic=linkstatic,
      tags=tags,
      size=size,
      args=args,
      nocopts="-fno-exceptions")
 def tf_cc_tests_gpu(srcs,
                    deps,
@ -1029,16 +1055,12 @@ register_extension_info(
 def tf_mkl_kernel_library(name,
                          prefix=None,
                          srcs=None,
                          gpu_srcs=None,
                          hdrs=None,
                          deps=None,
                          alwayslink=1,
                          copts=tf_copts(),
-                          nocopts="-fno-exceptions",
+                          nocopts="-fno-exceptions"):
                          **kwargs):
  """A rule to build MKL-based TensorFlow kernel libraries."""
  gpu_srcs = gpu_srcs  # unused argument
  kwargs = kwargs  # unused argument
  if not bool(srcs):
    srcs = []
@ -1051,16 +1073,15 @@ def tf_mkl_kernel_library(name,
    hdrs = hdrs + native.glob(
        [prefix + "*.h"])
-  if_mkl(
+  native.cc_library(
-      native.cc_library(
+      name=name,
-          name=name,
+      srcs=if_mkl(srcs),
-          srcs=srcs,
+      hdrs=hdrs,
-          hdrs=hdrs,
+      deps=deps,
-          deps=deps,
+      alwayslink=alwayslink,
-          alwayslink=alwayslink,
+      copts=copts,
-          copts=copts,
+      nocopts=nocopts
-          nocopts=nocopts
+  )
      ))
 register_extension_info(
    extension_name = "tf_mkl_kernel_library",
--- a/tensorflow/tools/api/generator/create_python_api.py
+++ b/tensorflow/tools/api/generator/create_python_api.py
@ -160,7 +160,8 @@ def get_api_init_text():
  # we want to traverse over TensorFlow Python modules.
  for module in sys.modules.values():
    # Only look at tensorflow modules.
-    if not module or 'tensorflow.' not in module.__name__:
+    if (not module or not hasattr(module, '__name__') or
        'tensorflow.' not in module.__name__):
      continue
    # Do not generate __init__.py files for contrib modules for now.
    if '.contrib.' in module.__name__ or module.__name__.endswith('.contrib'):
--- a/tensorflow/tools/docker/Dockerfile
+++ b/tensorflow/tools/docker/Dockerfile
@ -47,7 +47,7 @@ RUN pip --no-cache-dir install \
    http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.0.0-cp27-none-linux_x86_64.whl
 # --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
-# RUN ln -s /usr/bin/python3 /usr/bin/python#
+# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@ -38,6 +38,8 @@ RUN pip --no-cache-dir install \
        && \
    python -m ipykernel.kernelspec
 # RUN ln -s -f /usr/bin/python3 /usr/bin/python#
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@ -47,6 +47,8 @@ RUN pip --no-cache-dir install \
        && \
    python -m ipykernel.kernelspec
 # RUN ln -s -f /usr/bin/python3 /usr/bin/python#
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
--- a/tensorflow/tools/docker/Dockerfile.gpu
+++ b/tensorflow/tools/docker/Dockerfile.gpu
@ -54,7 +54,7 @@ RUN pip --no-cache-dir install \
    http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-0.0.0-cp27-none-linux_x86_64.whl
 # --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
-# RUN ln -s /usr/bin/python3 /usr/bin/python#
+# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
--- a/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
+++ b/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
@ -1207,7 +1207,7 @@
   "source": [
    "# Training computation: logits + cross-entropy loss.\n",
    "logits = model(train_data_node, True)\n",
-    "loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(\n",
+    "loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(\n",
    "  labels=train_labels_node, logits=logits))\n",
    "\n",
    "# L2 regularization for the fully connected parameters.\n",
@ -2031,7 +2031,7 @@
   "views": {}
  },
  "kernelspec": {
-   "display_name": "Python [default]",
+   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
@ -2049,5 +2049,5 @@
  }
 },
 "nbformat": 4,
- "nbformat_minor": 0
+ "nbformat_minor": 1
 }
--- a/tensorflow/tools/docker/parameterized_docker_build.sh
+++ b/tensorflow/tools/docker/parameterized_docker_build.sh
@ -284,7 +284,7 @@ if [[ "${TF_DOCKER_BUILD_IS_DEVEL}" == "no" ]]; then
    if sed -i -e 's/python /python3 /g' "${DOCKERFILE}" && \
        sed -i -e 's/python-dev/python3-dev/g' "${DOCKERFILE}" && \
        sed -i -e 's/pip /pip3 /g' "${DOCKERFILE}" && \
-        sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
+        sed -i -e 's^# RUN ln -s -f /usr/bin/python3 /usr/bin/python#^RUN ln -s -f /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
    then
      echo "Modified Dockerfile for python version "\
 "${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}"
@ -306,7 +306,7 @@ else
        sed -i -e 's^/tmp/pip^/tmp/pip3^g' "${DOCKERFILE}" && \
        sed -i -e 's/pip /pip3 /g' "${DOCKERFILE}" && \
        sed -i -e 's/ENV CI_BUILD_PYTHON python/ENV CI_BUILD_PYTHON python3/g' "${DOCKERFILE}" && \
-        sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
+        sed -i -e 's^# RUN ln -s -f /usr/bin/python3 /usr/bin/python#^RUN ln -s -f /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
    then
      echo "Modified Dockerfile further for python version ${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}"
    else
--- a/tensorflow/tools/docs/BUILD
+++ b/tensorflow/tools/docs/BUILD
@ -37,7 +37,7 @@ py_library(
    srcs = ["parser.py"],
    srcs_version = "PY2AND3",
    visibility = ["//visibility:public"],
-    deps = ["@com_github_andreif_codegen"],
+    deps = ["@astor_archive//:astor"],
 )
 py_test(
--- a/tensorflow/tools/docs/build_docs_test.py
+++ b/tensorflow/tools/docs/build_docs_test.py
@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 import os
 import sys
 import textwrap
 import tensorflow as tf
@ -39,10 +38,6 @@ class Flags(object):
 class BuildDocsTest(googletest.TestCase):
  def testBuildDocs(self):
    if sys.version_info >= (3, 0):
      print('Warning: Doc generation is not supported from python3.')
      return
    doc_generator = generate_lib.DocGenerator()
    doc_generator.set_py_modules([('tf', tf), ('tfdbg', tf_debug)])
--- a/tensorflow/tools/docs/generate_lib.py
+++ b/tensorflow/tools/docs/generate_lib.py
@ -21,7 +21,6 @@ from __future__ import print_function
 import argparse
 import fnmatch
 import os
 import sys
 import six
@ -134,8 +133,12 @@ def write_docs(output_dir, parser_config, yaml_toc, root_title='TensorFlow'):
    try:
      if not os.path.exists(directory):
        os.makedirs(directory)
-      with open(path, 'w') as f:
+      # This function returns raw bytes in PY2 or unicode in PY3.
-        f.write(pretty_docs.build_md_page(page_info))
+      text = pretty_docs.build_md_page(page_info)
      if six.PY3:
        text = text.encode('utf-8')
      with open(path, 'wb') as f:
        f.write(text)
    except OSError as e:
      print('Cannot write documentation for %s to %s: %s' % (full_name,
                                                             directory, e))
@ -437,19 +440,19 @@ def _other_docs(src_dir, output_dir, reference_resolver, file_pattern='*.md'):
      full_out_path = os.path.join(output_dir, suffix)
      if not fnmatch.fnmatch(base_name, file_pattern):
        print('Copying un-matched file %s...' % suffix)
-        open(full_out_path, 'w').write(open(full_in_path).read())
+        open(full_out_path, 'wb').write(open(full_in_path, 'rb').read())
        continue
      if dirpath.endswith('/api_guides/python'):
        print('Processing Python guide %s...' % base_name)
        content = tag_updater.process(full_in_path)
      else:
        print('Processing doc %s...' % suffix)
-        content = open(full_in_path).read()
+        content = open(full_in_path, 'rb').read().decode('utf-8')
      content = reference_resolver.replace_references(content,
                                                      relative_path_to_root)
-      with open(full_out_path, 'w') as f:
+      with open(full_out_path, 'wb') as f:
-        f.write(content)
+        f.write(content.encode('utf-8'))
  print('Done.')
@ -458,8 +461,6 @@ class DocGenerator(object):
  """Main entry point for generating docs."""
  def __init__(self):
    if sys.version_info >= (3, 0):
      sys.exit('Doc generation is not supported from python3.')
    self.argument_parser = argparse.ArgumentParser()
    self._py_modules = None
    self._private_map = _get_default_private_map()
--- a/tensorflow/tools/docs/generate_lib_test.py
+++ b/tensorflow/tools/docs/generate_lib_test.py
@ -52,9 +52,6 @@ class DummyVisitor(object):
 class GenerateTest(googletest.TestCase):
  def test_write(self):
    if sys.version_info >= (3, 0):
      self.skipTest('Warning: Doc generation is not supported from python3.')
    module = sys.modules[__name__]
    index = {
--- a/tensorflow/tools/docs/parser.py
+++ b/tensorflow/tools/docs/parser.py
@ -26,7 +26,7 @@ import os
 import re
 import sys
-import codegen
+import astor
 import six
 from google.protobuf.message import Message as ProtoMessage
@ -621,20 +621,20 @@ def _parse_md_docstring(py_object, relative_path_to_root, reference_resolver):
 def _get_arg_spec(func):
  """Extracts signature information from a function or functools.partial object.
-  For functions, uses `tf_inspect.getargspec`. For `functools.partial` objects,
+  For functions, uses `tf_inspect.getfullargspec`. For `functools.partial`
-  corrects the signature of the underlying function to take into account the
+  objects, corrects the signature of the underlying function to take into
-  removed arguments.
+  account the removed arguments.
  Args:
    func: A function whose signature to extract.
  Returns:
-    An `ArgSpec` namedtuple `(args, varargs, keywords, defaults)`, as returned
+    An `FullArgSpec` namedtuple `(args, varargs, varkw, defaults, etc.)`,
-    by `tf_inspect.getargspec`.
+    as returned by `tf_inspect.getfullargspec`.
  """
-  # getargspec does not work for functools.partial objects directly.
+  # getfullargspec does not work for functools.partial objects directly.
  if isinstance(func, functools.partial):
-    argspec = tf_inspect.getargspec(func.func)
+    argspec = tf_inspect.getfullargspec(func.func)
    # Remove the args from the original function that have been used up.
    first_default_arg = (
        len(argspec.args or []) - len(argspec.defaults or []))
@ -657,12 +657,16 @@ def _get_arg_spec(func):
          argspec_defaults.pop(i-first_default_arg)
        else:
          first_default_arg -= 1
-    return tf_inspect.ArgSpec(args=argspec_args,
+    return tf_inspect.FullArgSpec(
-                              varargs=argspec.varargs,
+        args=argspec_args,
-                              keywords=argspec.keywords,
+        varargs=argspec.varargs,
-                              defaults=tuple(argspec_defaults))
+        varkw=argspec.varkw,
        defaults=tuple(argspec_defaults),
        kwonlyargs=[],
        kwonlydefaults=None,
        annotations={})
  else:  # Regular function or method, getargspec will work fine.
-    return tf_inspect.getargspec(func)
+    return tf_inspect.getfullargspec(func)
 def _remove_first_line_indent(string):
@ -670,11 +674,14 @@ def _remove_first_line_indent(string):
  return '\n'.join([line[indent:] for line in string.split('\n')])
 PAREN_NUMBER_RE = re.compile(r'^\(([0-9.e-]+)\)')
 def _generate_signature(func, reverse_index):
  """Given a function, returns a list of strings representing its args.
  This function produces a list of strings representing the arguments to a
-  python function. It uses tf_inspect.getargspec, which
+  python function. It uses tf_inspect.getfullargspec, which
  does not generalize well to Python 3.x, which is more flexible in how *args
  and **kwargs are handled. This is not a problem in TF, since we have to remain
  compatible to Python 2.7 anyway.
@ -725,7 +732,11 @@ def _generate_signature(func, reverse_index):
      if id(default) in reverse_index:
        default_text = reverse_index[id(default)]
      elif ast_default is not None:
-        default_text = codegen.to_source(ast_default)
+        default_text = (
            astor.to_source(ast_default).rstrip('\n').replace('\t', '\\t')
            .replace('\n', '\\n').replace('"""', "'"))
        default_text = PAREN_NUMBER_RE.sub('\\1', default_text)
        if default_text != repr(default):
          # This may be an internal name. If so, handle the ones we know about.
          # TODO(wicke): This should be replaced with a lookup in the index.
@ -758,8 +769,8 @@ def _generate_signature(func, reverse_index):
  # Add *args and *kwargs.
  if argspec.varargs:
    args_list.append('*' + argspec.varargs)
-  if argspec.keywords:
+  if argspec.varkw:
-    args_list.append('**' + argspec.keywords)
+    args_list.append('**' + argspec.varkw)
  return args_list
@ -1136,9 +1147,11 @@ class _ClassPageInfo(object):
    for short_name in parser_config.tree[self.full_name]:
      # Remove builtin members that we never want to document.
-      if short_name in ['__class__', '__base__', '__weakref__', '__doc__',
+      if short_name in [
-                        '__module__', '__dict__', '__abstractmethods__',
+          '__class__', '__base__', '__weakref__', '__doc__', '__module__',
-                        '__slots__', '__getnewargs__']:
+          '__dict__', '__abstractmethods__', '__slots__', '__getnewargs__',
          '__str__', '__repr__', '__hash__'
      ]:
        continue
      child_name = '.'.join([self.full_name, short_name])
@ -1183,7 +1196,8 @@ class _ClassPageInfo(object):
        # obvious what they do, don't include them in the docs if there's no
        # docstring.
        if not child_doc.brief.strip() and short_name in [
-            '__str__', '__repr__', '__hash__', '__del__', '__copy__']:
+            '__del__', '__copy__'
        ]:
          print('Skipping %s, defined in %s, no docstring.' % (child_name,
                                                               defining_class))
          continue
--- a/tensorflow/tools/docs/parser_test.py
+++ b/tensorflow/tools/docs/parser_test.py
@ -398,7 +398,6 @@ class ParserTest(googletest.TestCase):
    self.assertIn('<code>test_function', docs)
  def test_argspec_for_functools_partial(self):
    # pylint: disable=unused-argument
    def test_function_for_partial1(arg1, arg2, kwarg1=1, kwarg2=2):
      pass
@ -409,42 +408,95 @@ class ParserTest(googletest.TestCase):
    # pylint: disable=protected-access
    # Make sure everything works for regular functions.
-    expected = tf_inspect.ArgSpec(['arg1', 'arg2', 'kwarg1', 'kwarg2'], None,
+    expected = tf_inspect.FullArgSpec(
-                                  None, (1, 2))
+        args=['arg1', 'arg2', 'kwarg1', 'kwarg2'],
        varargs=None,
        varkw=None,
        defaults=(1, 2),
        kwonlyargs=[],
        kwonlydefaults=None,
        annotations={})
    self.assertEqual(expected, parser._get_arg_spec(test_function_for_partial1))
    # Make sure doing nothing works.
-    expected = tf_inspect.ArgSpec(['arg1', 'arg2', 'kwarg1', 'kwarg2'], None,
+    expected = tf_inspect.FullArgSpec(
-                                  None, (1, 2))
+        args=['arg1', 'arg2', 'kwarg1', 'kwarg2'],
        varargs=None,
        varkw=None,
        defaults=(1, 2),
        kwonlyargs=[],
        kwonlydefaults=None,
        annotations={})
    partial = functools.partial(test_function_for_partial1)
    self.assertEqual(expected, parser._get_arg_spec(partial))
    # Make sure setting args from the front works.
-    expected = tf_inspect.ArgSpec(['arg2', 'kwarg1', 'kwarg2'], None, None,
+    expected = tf_inspect.FullArgSpec(
-                                  (1, 2))
+        args=['arg2', 'kwarg1', 'kwarg2'],
        varargs=None,
        varkw=None,
        defaults=(1, 2),
        kwonlyargs=[],
        kwonlydefaults=None,
        annotations={})
    partial = functools.partial(test_function_for_partial1, 1)
    self.assertEqual(expected, parser._get_arg_spec(partial))
-    expected = tf_inspect.ArgSpec(['kwarg2',], None, None, (2,))
+    expected = tf_inspect.FullArgSpec(
        args=['kwarg2'],
        varargs=None,
        varkw=None,
        defaults=(2,),
        kwonlyargs=[],
        kwonlydefaults=None,
        annotations={})
    partial = functools.partial(test_function_for_partial1, 1, 2, 3)
    self.assertEqual(expected, parser._get_arg_spec(partial))
    # Make sure setting kwargs works.
-    expected = tf_inspect.ArgSpec(['arg1', 'arg2', 'kwarg2'], None, None, (2,))
+    expected = tf_inspect.FullArgSpec(
        args=['arg1', 'arg2', 'kwarg2'],
        varargs=None,
        varkw=None,
        defaults=(2,),
        kwonlyargs=[],
        kwonlydefaults=None,
        annotations={})
    partial = functools.partial(test_function_for_partial1, kwarg1=0)
    self.assertEqual(expected, parser._get_arg_spec(partial))
-    expected = tf_inspect.ArgSpec(['arg1', 'arg2', 'kwarg1'], None, None, (1,))
+    expected = tf_inspect.FullArgSpec(
        args=['arg1', 'arg2', 'kwarg1'],
        varargs=None,
        varkw=None,
        defaults=(1,),
        kwonlyargs=[],
        kwonlydefaults=None,
        annotations={})
    partial = functools.partial(test_function_for_partial1, kwarg2=0)
    self.assertEqual(expected, parser._get_arg_spec(partial))
-    expected = tf_inspect.ArgSpec(['arg1'], None, None, ())
+    expected = tf_inspect.FullArgSpec(
        args=['arg1'],
        varargs=None,
        varkw=None,
        defaults=(),
        kwonlyargs=[],
        kwonlydefaults=None,
        annotations={})
    partial = functools.partial(test_function_for_partial1,
                                arg2=0, kwarg1=0, kwarg2=0)
    self.assertEqual(expected, parser._get_arg_spec(partial))
    # Make sure *args, *kwargs is accounted for.
-    expected = tf_inspect.ArgSpec([], 'my_args', 'my_kwargs', ())
+    expected = tf_inspect.FullArgSpec(
        args=[],
        varargs='my_args',
        varkw='my_kwargs',
        defaults=(),
        kwonlyargs=[],
        kwonlydefaults=None,
        annotations={})
    partial = functools.partial(test_function_for_partial2, 0, 1)
    self.assertEqual(expected, parser._get_arg_spec(partial))
@ -524,10 +576,6 @@ class TestParseFunctionDetails(googletest.TestCase):
 class TestGenerateSignature(googletest.TestCase):
  def test_known_object(self):
    if sys.version_info >= (3, 0):
      print('Warning: Doc generation is not supported from python3.')
      return
    known_object = object()
    reverse_index = {id(known_object): 'location.of.object.in.api'}
--- a/tensorflow/tools/docs/pretty_docs.py
+++ b/tensorflow/tools/docs/pretty_docs.py
@ -101,7 +101,7 @@ def _build_class_page(page_info):
    link_template = '[`{short_name}`]({url})'
    parts.append(', '.join(
-        link_template.format(**base.__dict__) for base in page_info.bases))
+        link_template.format(**base._asdict()) for base in page_info.bases))
  parts.append('\n\n')
@ -159,7 +159,7 @@ def _build_class_page(page_info):
      h3 = ('<h3 id="{short_name}">'
            '<code>{short_name}</code>'
            '</h3>\n\n')
-      parts.append(h3.format(**method_info.__dict__))
+      parts.append(h3.format(**method_info._asdict()))
      if method_info.signature is not None:
        parts.append(_build_signature(method_info, use_full_name=False))
@ -217,7 +217,7 @@ def _build_module_page(page_info):
    template = '[`{short_name}`]({url}) module'
    for item in page_info.modules:
-      parts.append(template.format(**item.__dict__))
+      parts.append(template.format(**item._asdict()))
      if item.doc.brief:
        parts.append(': ' + item.doc.brief)
@ -229,7 +229,7 @@ def _build_module_page(page_info):
    template = '[`class {short_name}`]({url})'
    for item in page_info.classes:
-      parts.append(template.format(**item.__dict__))
+      parts.append(template.format(**item._asdict()))
      if item.doc.brief:
        parts.append(': ' + item.doc.brief)
@ -241,7 +241,7 @@ def _build_module_page(page_info):
    template = '[`{short_name}(...)`]({url})'
    for item in page_info.functions:
-      parts.append(template.format(**item.__dict__))
+      parts.append(template.format(**item._asdict()))
      if item.doc.brief:
        parts.append(': ' + item.doc.brief)
@ -254,7 +254,7 @@ def _build_module_page(page_info):
    parts.append('## Other Members\n\n')
    for item in page_info.other_members:
-      parts.append('`{short_name}`\n\n'.format(**item.__dict__))
+      parts.append('`{short_name}`\n\n'.format(**item._asdict()))
  return ''.join(parts)
--- a/tensorflow/tools/docs/py_guide_parser.py
+++ b/tensorflow/tools/docs/py_guide_parser.py
@ -44,7 +44,7 @@ class PyGuideParser(object):
  def process(self, full_path):
    """Read and process the file at `full_path`."""
-    md_string = open(full_path).read()
+    md_string = open(full_path, 'rb').read().decode('utf-8')
    self._lines = md_string.split('\n')
    seen = set()
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@ -316,17 +316,6 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
      build_file = clean_dep("//third_party:backports_weakref.BUILD"),
  )
  tf_http_archive(
      name = "com_github_andreif_codegen",
      urls = [
          "https://mirror.bazel.build/github.com/andreif/codegen/archive/1.0.tar.gz",
          "https://github.com/andreif/codegen/archive/1.0.tar.gz",
      ],
      sha256 = "2dadd04a2802de27e0fe5a19b76538f6da9d39ff244036afa00c1bba754de5ee",
      strip_prefix = "codegen-1.0",
      build_file = clean_dep("//third_party:codegen.BUILD"),
  )
  filegroup_external(
      name = "org_python_license",
      licenses = ["notice"],  # Python 2.0