Merge changes from github.
PiperOrigin-RevId: 194997009
This commit is contained in:
parent 46bf1e8934
commit 325d0ef21a

.gitignore (vendored)
@@ -27,6 +27,7 @@ Podfile.lock
/tensorflow/contrib/lite/examples/ios/simple/data/*.txt
/tensorflow/contrib/lite/examples/ios/simple/data/*.tflite
xcuserdata/**
/api_init_files_list.txt

# Android
.gradle
@@ -1700,7 +1700,7 @@ TEST_F(CApiGradientsTest, OpWithNoGradientRegistered_NoGradInputs) {
  TestGradientsError(false);
}

// REGISTER_OP for CApiTestAttributesTest test cases.
// REGISTER_OP for CApiAttributesTest test cases.
// Registers two ops, each with a single attribute called 'v'.
// The attribute in one op will have a type 'type', the other
// will have list(type).
@@ -385,6 +385,42 @@ Status MirrorPadGradGrad(const Scope& scope, const Operation& op,
}
REGISTER_GRADIENT_OP("MirrorPadGrad", MirrorPadGradGrad);

Status StridedSliceGradHelper(const Scope& scope, const Operation& op,
                              const std::vector<Output>& grad_inputs,
                              std::vector<Output>* grad_outputs) {
  Input x = Shape(scope, op.input(0));
  Input begin = op.input(1);
  Input end = op.input(2);
  Input strides = op.input(3);
  int64 begin_mask;
  int64 end_mask;
  int64 ellipsis_mask;
  int64 new_axis_mask;
  int64 shrink_axis_mask;
  TF_RETURN_IF_ERROR(
      GetNodeAttr(op.node()->attrs(), "begin_mask", &begin_mask));
  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "end_mask", &end_mask));
  TF_RETURN_IF_ERROR(
      GetNodeAttr(op.node()->attrs(), "ellipsis_mask", &ellipsis_mask));
  TF_RETURN_IF_ERROR(
      GetNodeAttr(op.node()->attrs(), "new_axis_mask", &new_axis_mask));
  TF_RETURN_IF_ERROR(
      GetNodeAttr(op.node()->attrs(), "shrink_axis_mask", &shrink_axis_mask));
  grad_outputs->push_back(
      StridedSliceGrad(scope, x, begin, end, strides, grad_inputs[0],
                       StridedSliceGrad::BeginMask(begin_mask)
                           .EndMask(end_mask)
                           .EllipsisMask(ellipsis_mask)
                           .NewAxisMask(new_axis_mask)
                           .ShrinkAxisMask(shrink_axis_mask)));
  // No gradients returned for begin, end and strides
  grad_outputs->push_back(NoGradient());
  grad_outputs->push_back(NoGradient());
  grad_outputs->push_back(NoGradient());
  return scope.status();
}
REGISTER_GRADIENT_OP("StridedSlice", StridedSliceGradHelper);

}  // anonymous namespace
}  // namespace ops
}  // namespace tensorflow
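For readers unfamiliar with the mask attributes this gradient forwards: bit `i` of `begin_mask` tells `strided_slice` to ignore `begin[i]` for that axis. A minimal Python sketch of the semantics (hypothetical values; assumes the TF 1.x API):

```python
import tensorflow as tf

x = tf.reshape(tf.range(24, dtype=tf.float32), [6, 4])
# Bit 1 of begin_mask is set, so begin[1] is ignored and axis 1
# starts from 0 -- the same attribute StridedSliceGradHelper reads
# back and forwards into StridedSliceGrad above.
y = tf.strided_slice(x, [2, 1], [6, 3], [2, 1], begin_mask=1 << 1)

with tf.Session() as sess:
  print(sess.run(tf.shape(y)))  # [2, 3]
```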
@@ -354,5 +354,29 @@ TEST_F(ArrayGradTest, MirrorPadGradGrad_Symmetric) {
  RunTest(x, x_shape, y, y_shape);
}

TEST_F(ArrayGradTest, StridedSliceGrad) {
  TensorShape x_shape({6, 4, 4});
  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));

  // y = x[2:6:2, 1:3, 1:3]
  auto y = StridedSlice(scope_, x, {2, 1, 1}, {6, 3, 3}, {2, 1, 1});
  // y.shape = [2, 2, 2];
  RunTest(x, x_shape, y, {2, 2, 2});

  // y = x[2:6:2, 1:3, 1:3]
  // begin_mask = 1<<1 (ignore begin_index = 1)
  // end_mask = 1<<2 (ignore end_index = 2)
  y = StridedSlice(scope_, x, {2, 1, 1}, {6, 3, 3}, {2, 1, 1},
                   StridedSlice::BeginMask(1 << 1).EndMask(1 << 2));
  // y.shape = [2, 3, 3];
  RunTest(x, x_shape, y, {2, 3, 3});

  // y = [tf.newaxis, 2:6:2, 1:3, 1:3]
  y = StridedSlice(scope_, x, {0, 2, 1, 1}, {0, 6, 3, 3}, {1, 2, 1, 1},
                   StridedSlice::NewAxisMask(1 << 0));
  // y.shape = [1, 2, 2, 2];
  RunTest(x, x_shape, y, {1, 2, 2, 2});
}

}  // namespace
}  // namespace tensorflow
@@ -56,8 +56,6 @@ Use AutoGraph in one of the following ways, described below:
1. Annotations (simpler)
2. Functional API (more flexible)

NOTE: You can find more examples in this [interactive notebook](https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb).

To get started, install the latest nightly TensorFlow build:

```shell
@@ -70,6 +68,13 @@ Then import the `autograph` module from `tf.contrib`:
from tensorflow.contrib import autograph as ag
```

### Interactive demo notebooks

For more extensive examples, check out these interactive notebooks:

* [RNN trained using Keras and Estimators](https://colab.sandbox.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb)
* [Demo from the TF Dev Summit 2018](https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb)

## Using with annotations

Annotating a function or class with `@convert` converts it in place:
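A small sketch of the annotation workflow the README describes, assuming the contrib-era `autograph` import shown above (`ag.convert` as a decorator is an assumption based on the README's description):

```python
from tensorflow.contrib import autograph as ag

@ag.convert()  # rewrites the Python control flow below into graph ops
def sum_even(items):
  s = 0
  for c in items:
    if c % 2 > 0:
      continue
    s += c
  return s
```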
@@ -84,7 +84,7 @@ if (NOT WIN32)

option(systemlib_ALL "Turn on every possible systemlib_* options" OFF)
if (systemlib_ALL)
  set (systmelib_ZLIB ON)
  set (systemlib_ZLIB ON)
endif (systemlib_ALL)
endif()

@@ -471,6 +471,10 @@ if (tensorflow_ENABLE_GPU)
  include_directories(${tensorflow_source_dir}/third_party/gpus)
  # add cuda libraries to tensorflow_EXTERNAL_LIBRARIES
  list(APPEND tensorflow_EXTERNAL_LIBRARIES ${CUDA_LIBRARIES})
  if(NOT WIN32)
    # add gomp to tensorflow_EXTERNAL_LIBRARIES, needed by libcusolver.so
    list(APPEND tensorflow_EXTERNAL_LIBRARIES gomp)
  endif()

  # NOTE(mrry): Update these flags when the version of CUDA or cuDNN used
  # in the default build is upgraded.
@@ -177,6 +177,16 @@ if(WIN32)
      "${tensorflow_source_dir}/tensorflow/contrib/nccl/ops/nccl_ops.cc"
  )
  list(REMOVE_ITEM tf_core_kernels_srcs ${tf_core_kernels_windows_exclude_srcs})
else(WIN32)
  if(tensorflow_ENABLE_GPU)
    file(GLOB_RECURSE tf_core_kernels_gpu_exclude_srcs
        # temporarily disable nccl as it needs to be ported with gpu
        "${tensorflow_source_dir}/tensorflow/contrib/nccl/kernels/nccl_manager.cc"
        "${tensorflow_source_dir}/tensorflow/contrib/nccl/kernels/nccl_ops.cc"
        "${tensorflow_source_dir}/tensorflow/contrib/nccl/ops/nccl_ops.cc"
    )
    list(REMOVE_ITEM tf_core_kernels_srcs ${tf_core_kernels_gpu_exclude_srcs})
  endif(tensorflow_ENABLE_GPU)
endif(WIN32)

file(GLOB_RECURSE tf_core_gpu_kernels_srcs
@@ -64,6 +64,8 @@ file(GLOB tf_stream_executor_srcs
if (tensorflow_ENABLE_GPU)
  file(GLOB tf_stream_executor_gpu_srcs
      "${tensorflow_source_dir}/tensorflow/stream_executor/cuda/*.cc"
      "${tensorflow_source_dir}/tensorflow/compiler/xla/statusor.h"
      "${tensorflow_source_dir}/tensorflow/compiler/xla/statusor.cc"
  )
  if (NOT tensorflow_BUILD_CC_TESTS)
    file(GLOB tf_stream_executor_gpu_tests
@@ -152,6 +152,22 @@ class CrfTest(test.TestCase):

      self.assertAllClose(tf_log_norm, tf_brute_force_log_norm)

  def testCrfLogNormZeroSeqLength(self):
    """
    Test `crf_log_norm` when `sequence_lengths` contains one or more zeros.
    """
    with self.test_session() as sess:
      inputs = constant_op.constant(np.ones([2, 10, 5],
                                            dtype=np.float32))
      transition_params = constant_op.constant(np.ones([5, 5],
                                               dtype=np.float32))
      sequence_lengths = constant_op.constant(np.zeros([2],
                                              dtype=np.int32))
      expected_log_norm = np.zeros([2], dtype=np.float32)
      log_norm = crf.crf_log_norm(inputs, sequence_lengths, transition_params)
      tf_log_norm = sess.run(log_norm)
      self.assertAllClose(tf_log_norm, expected_log_norm)

  def testCrfLogLikelihood(self):
    inputs = np.array(
        [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32)
@@ -292,10 +308,10 @@ class CrfTest(test.TestCase):
                                        dtype=np.float32))
      sequence_lengths = constant_op.constant(np.zeros([2],
                                              dtype=np.int32))
      values = crf.crf_decode(inputs, transition_params, sequence_lengths)
      tags, scores = sess.run(values)
      self.assertEqual(len(tags.shape), 2)
      self.assertEqual(len(scores.shape), 1)
      tags, scores = crf.crf_decode(inputs, transition_params, sequence_lengths)
      tf_tags, tf_scores = sess.run([tags, scores])
      self.assertEqual(len(tf_tags.shape), 2)
      self.assertEqual(len(tf_scores.shape), 1)


if __name__ == "__main__":
  test.main()
@@ -90,9 +90,13 @@ def crf_sequence_score(inputs, tag_indices, sequence_lengths,
    batch_size = array_ops.shape(inputs, out_type=tag_indices.dtype)[0]
    example_inds = array_ops.reshape(
        math_ops.range(batch_size, dtype=tag_indices.dtype), [-1, 1])
    return array_ops.gather_nd(
    sequence_scores = array_ops.gather_nd(
        array_ops.squeeze(inputs, [1]),
        array_ops.concat([example_inds, tag_indices], axis=1))
    sequence_scores = array_ops.where(math_ops.less_equal(sequence_lengths, 0),
                                      array_ops.zeros_like(sequence_scores),
                                      sequence_scores)
    return sequence_scores

  def _multi_seq_fn():
    # Compute the scores of the given tag sequence.
@@ -128,7 +132,12 @@ def crf_log_norm(inputs, sequence_lengths, transition_params):
  # If max_seq_len is 1, we skip the algorithm and simply reduce_logsumexp over
  # the "initial state" (the unary potentials).
  def _single_seq_fn():
    return math_ops.reduce_logsumexp(first_input, [1])
    log_norm = math_ops.reduce_logsumexp(first_input, [1])
    # Mask `log_norm` of the sequences with length <= zero.
    log_norm = array_ops.where(math_ops.less_equal(sequence_lengths, 0),
                               array_ops.zeros_like(log_norm),
                               log_norm)
    return log_norm

  def _multi_seq_fn():
    """Forward computation of alpha values."""
@@ -137,13 +146,19 @@ def crf_log_norm(inputs, sequence_lengths, transition_params):
    # Compute the alpha values in the forward algorithm in order to get the
    # partition function.
    forward_cell = CrfForwardRnnCell(transition_params)
    # Sequence length is not allowed to be less than zero.
    sequence_lengths_less_one = math_ops.maximum(0, sequence_lengths - 1)
    _, alphas = rnn.dynamic_rnn(
        cell=forward_cell,
        inputs=rest_of_input,
        sequence_length=sequence_lengths - 1,
        sequence_length=sequence_lengths_less_one,
        initial_state=first_input,
        dtype=dtypes.float32)
    log_norm = math_ops.reduce_logsumexp(alphas, [1])
    # Mask `log_norm` of the sequences with length <= zero.
    log_norm = array_ops.where(math_ops.less_equal(sequence_lengths, 0),
                               array_ops.zeros_like(log_norm),
                               log_norm)
    return log_norm

  max_seq_len = array_ops.shape(inputs)[1]
@@ -479,7 +494,7 @@ def crf_decode(potentials, transition_params, sequence_length):
  initial_state = array_ops.slice(potentials, [0, 0, 0], [-1, 1, -1])
  initial_state = array_ops.squeeze(initial_state, axis=[1])  # [B, O]
  inputs = array_ops.slice(potentials, [0, 1, 0], [-1, -1, -1])  # [B, T-1, O]
  # sequence length is not allowed to be less than zero
  # Sequence length is not allowed to be less than zero.
  sequence_length_less_one = math_ops.maximum(0, sequence_length - 1)
  backpointers, last_score = rnn.dynamic_rnn(  # [B, T - 1, O], [B, O]
      crf_fwd_cell,
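All of the `crf.py` hunks above apply the same guard: compute the quantity, then zero it wherever `sequence_lengths <= 0`. A standalone NumPy sketch of that masking (illustrative arrays, not the library code):

```python
import numpy as np

log_norm = np.array([3.2, 5.1], dtype=np.float32)
sequence_lengths = np.array([0, 4], dtype=np.int32)

# An empty sequence has partition function 1, hence log-normalizer 0.
masked = np.where(sequence_lengths <= 0, np.zeros_like(log_norm), log_norm)
print(masked)  # [0.  5.1]
```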
@@ -0,0 +1,109 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for Bijector."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

from tensorflow.contrib.distributions.python.ops.bijectors.ordered import Ordered
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import tensor_shape
from tensorflow.python.framework import test_util
from tensorflow.python.ops import array_ops
from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite
from tensorflow.python.platform import test


class OrderedBijectorTest(test.TestCase):
  """Tests correctness of the ordered transformation."""

  def setUp(self):
    self._rng = np.random.RandomState(42)

  @test_util.run_in_graph_and_eager_modes()
  def testBijectorVector(self):
    with self.test_session():
      ordered = Ordered()
      self.assertEqual("ordered", ordered.name)
      x = np.asarray([[2., 3, 4], [4., 8, 13]])
      y = [[2., 0, 0], [4., np.log(4.), np.log(5.)]]
      self.assertAllClose(y, self.evaluate(ordered.forward(x)))
      self.assertAllClose(x, self.evaluate(ordered.inverse(y)))
      self.assertAllClose(
          np.sum(np.asarray(y)[..., 1:], axis=-1),
          self.evaluate(ordered.inverse_log_det_jacobian(y, event_ndims=1)),
          atol=0.,
          rtol=1e-7)
      self.assertAllClose(
          self.evaluate(-ordered.inverse_log_det_jacobian(y, event_ndims=1)),
          self.evaluate(ordered.forward_log_det_jacobian(x, event_ndims=1)),
          atol=0.,
          rtol=1e-7)

  def testBijectorUnknownShape(self):
    with self.test_session():
      ordered = Ordered()
      self.assertEqual("ordered", ordered.name)
      x = array_ops.placeholder(shape=[2, None], dtype=dtypes.float32)
      real_x = np.asarray([[2., 3, 4], [4., 8, 13]])
      y = array_ops.placeholder(shape=[2, None], dtype=dtypes.float32)
      real_y = [[2., 0, 0], [4., np.log(4.), np.log(5.)]]
      self.assertAllClose(real_y, ordered.forward(x).eval(
          feed_dict={x: real_x}))
      self.assertAllClose(real_x, ordered.inverse(y).eval(
          feed_dict={y: real_y}))
      self.assertAllClose(
          np.sum(np.asarray(real_y)[..., 1:], axis=-1),
          ordered.inverse_log_det_jacobian(y, event_ndims=1).eval(
              feed_dict={y: real_y}),
          atol=0.,
          rtol=1e-7)
      self.assertAllClose(
          -ordered.inverse_log_det_jacobian(y, event_ndims=1).eval(
              feed_dict={y: real_y}),
          ordered.forward_log_det_jacobian(x, event_ndims=1).eval(
              feed_dict={x: real_x}),
          atol=0.,
          rtol=1e-7)

  @test_util.run_in_graph_and_eager_modes()
  def testShapeGetters(self):
    with self.test_session():
      x = tensor_shape.TensorShape([4])
      y = tensor_shape.TensorShape([4])
      bijector = Ordered(validate_args=True)
      self.assertAllEqual(y, bijector.forward_event_shape(x))
      self.assertAllEqual(y.as_list(),
                          self.evaluate(bijector.forward_event_shape_tensor(
                              x.as_list())))
      self.assertAllEqual(x, bijector.inverse_event_shape(y))
      self.assertAllEqual(x.as_list(),
                          self.evaluate(bijector.inverse_event_shape_tensor(
                              y.as_list())))

  def testBijectiveAndFinite(self):
    with self.test_session():
      ordered = Ordered()
      x = np.sort(self._rng.randn(3, 10), axis=-1).astype(np.float32)
      y = (self._rng.randn(3, 10)).astype(np.float32)
      assert_bijective_and_finite(ordered, x, y, event_ndims=1)


if __name__ == "__main__":
  test.main()
@@ -30,6 +30,7 @@
@@Invert
@@Kumaraswamy
@@MaskedAutoregressiveFlow
@@Ordered
@@Permute
@@PowerTransform
@@RealNVP
@@ -67,6 +68,7 @@ from tensorflow.contrib.distributions.python.ops.bijectors.inline import *
from tensorflow.contrib.distributions.python.ops.bijectors.invert import *
from tensorflow.contrib.distributions.python.ops.bijectors.kumaraswamy import *
from tensorflow.contrib.distributions.python.ops.bijectors.masked_autoregressive import *
from tensorflow.contrib.distributions.python.ops.bijectors.ordered import *
from tensorflow.contrib.distributions.python.ops.bijectors.permute import *
from tensorflow.contrib.distributions.python.ops.bijectors.power_transform import *
from tensorflow.contrib.distributions.python.ops.bijectors.real_nvp import *
@@ -170,7 +170,7 @@ class CholeskyOuterProduct(bijector.Bijector):
    sum_weighted_log_diag = array_ops.squeeze(
        math_ops.matmul(math_ops.log(diag),
                        exponents[..., array_ops.newaxis]),
        squeeze_dims=-1)
        axis=-1)
    fldj = p_float * np.log(2.) + sum_weighted_log_diag

    return fldj
@@ -18,14 +18,14 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from tensorflow.python.ops.distributions import bijector as bijector_lib
from tensorflow.python.ops.distributions import bijector

__all__ = [
    "Invert",
]


class Invert(bijector_lib.Bijector):
class Invert(bijector.Bijector):
  """Bijector which inverts another Bijector.

  Example Use: [ExpGammaDistribution (see Background & Context)](
@@ -32,7 +32,7 @@ from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import template as template_ops
from tensorflow.python.ops import variable_scope as variable_scope_lib
from tensorflow.python.ops.distributions import bijector as bijector_lib
from tensorflow.python.ops.distributions import bijector


__all__ = [
@@ -42,7 +42,7 @@ __all__ = [
]


class MaskedAutoregressiveFlow(bijector_lib.Bijector):
class MaskedAutoregressiveFlow(bijector.Bijector):
  """Affine MaskedAutoregressiveFlow bijector for vector-valued events.

  The affine autoregressive flow [(Papamakarios et al., 2016)][3] provides a
tensorflow/contrib/distributions/python/ops/bijectors/ordered.py (new file, 125 lines)
@@ -0,0 +1,125 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Ordered bijector."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function


from tensorflow.python.framework import tensor_shape
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops.distributions import bijector


__all__ = [
    "Ordered",
]


class Ordered(bijector.Bijector):
  """Bijector which maps a tensor x_k that has increasing elements in the last
  dimension to an unconstrained tensor y_k.

  Both the domain and the codomain of the mapping are [-inf, inf]; however,
  the input of the forward mapping must be strictly increasing.
  The inverse of the bijector applied to a normal random vector `y ~ N(0, 1)`
  gives back a sorted random vector with the same distribution `x ~ N(0, 1)`
  where `x = sort(y)`.

  On the last dimension of the tensor, the Ordered bijector performs:
  `y[0] = x[0]`
  `y[1:] = math_ops.log(x[1:] - x[:-1])`

  #### Example Use:

  ```python
  bijector.Ordered().forward([2, 3, 4])
  # Result: [2., 0., 0.]

  bijector.Ordered().inverse([0.06428002, -1.07774478, -0.71530371])
  # Result: [0.06428002, 0.40464228, 0.8936858]
  ```
  """

  def __init__(self, validate_args=False, name="ordered"):
    super(Ordered, self).__init__(
        forward_min_event_ndims=1,
        validate_args=validate_args,
        name=name)

  def _forward_event_shape(self, input_shape):
    if input_shape.ndims is None or input_shape[-1] is None:
      return input_shape
    return tensor_shape.TensorShape([input_shape[-1]])

  def _forward_event_shape_tensor(self, input_shape):
    return (input_shape[-1])[..., array_ops.newaxis]

  def _inverse_event_shape(self, output_shape):
    if output_shape.ndims is None or output_shape[-1] is None:
      return output_shape
    if output_shape[-1] <= 1:
      raise ValueError("output_shape[-1] = %d <= 1" % output_shape[-1])
    return tensor_shape.TensorShape([output_shape[-1]])

  def _inverse_event_shape_tensor(self, output_shape):
    if self.validate_args:
      is_greater_one = check_ops.assert_greater(
          output_shape[-1], 1, message="Need last dimension greater than 1.")
      output_shape = control_flow_ops.with_dependencies(
          [is_greater_one], output_shape)
    return (output_shape[-1])[..., array_ops.newaxis]

  def _forward(self, x):
    x = self._maybe_assert_valid_x(x)
    y0 = x[..., 0, array_ops.newaxis]
    yk = math_ops.log(x[..., 1:] - x[..., :-1])
    y = array_ops.concat([y0, yk], axis=-1)
    return y

  def _inverse(self, y):
    x0 = y[..., 0, array_ops.newaxis]
    xk = math_ops.exp(y[..., 1:])
    x = array_ops.concat([x0, xk], axis=-1)
    return math_ops.cumsum(x, axis=-1)

  def _inverse_log_det_jacobian(self, y):
    # The Jacobian of the inverse mapping is lower
    # triangular, with the diagonal elements being:
    # J[i,i] = 1 if i=1, and
    #          exp(y_i) if 1<i<=K
    # which gives the absolute Jacobian determinant:
    # |det(Jac)| = prod_{i=1}^{K} exp(y[i]).
    # (1) - Stan Modeling Language User's Guide and Reference Manual
    #       Version 2.17.0 section 35.2
    return math_ops.reduce_sum(y[..., 1:], axis=-1)

  def _forward_log_det_jacobian(self, x):
    x = self._maybe_assert_valid_x(x)
    return -math_ops.reduce_sum(
        math_ops.log(x[..., 1:] - x[..., :-1]),
        axis=-1)

  def _maybe_assert_valid_x(self, x):
    if not self.validate_args:
      return x
    is_valid = check_ops.assert_positive(
        x[..., 1:] - x[..., :-1],
        message="Forward transformation input must be strictly increasing.")
    return control_flow_ops.with_dependencies([is_valid], x)
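A quick NumPy check of the forward/inverse pair defined by `_forward` and `_inverse` above (a sketch, not part of the commit):

```python
import numpy as np

x = np.array([2., 3., 4.])                                  # strictly increasing
y = np.concatenate([x[:1], np.log(np.diff(x))])             # forward: [2., 0., 0.]
x_back = np.cumsum(np.concatenate([y[:1], np.exp(y[1:])]))  # inverse
assert np.allclose(x, x_back)
```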
@@ -28,7 +28,7 @@ from tensorflow.python.ops import check_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops.distributions import bijector as bijector_lib
from tensorflow.python.ops.distributions import bijector


__all__ = [
@@ -36,7 +36,7 @@ __all__ = [
]


class Permute(bijector_lib.Bijector):
class Permute(bijector.Bijector):
  """Permutes the rightmost dimension of a `Tensor`.

  ```python
@@ -25,7 +25,7 @@ from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import template as template_ops
from tensorflow.python.ops.distributions import bijector as bijector_lib
from tensorflow.python.ops.distributions import bijector


__all__ = [
@@ -34,7 +34,7 @@ __all__ = [
]


class RealNVP(bijector_lib.Bijector):
class RealNVP(bijector.Bijector):
  """RealNVP "affine coupling layer" for vector-valued events.

  Real NVP models a normalizing flow on a `D`-dimensional distribution via a
@@ -28,7 +28,7 @@ from tensorflow.python.ops import array_ops
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops.distributions import bijector as bijector_lib
from tensorflow.python.ops.distributions import bijector


__all__ = [
@@ -44,7 +44,7 @@ def _ndims_from_shape(shape):
  return array_ops.shape(shape)[0]


class Reshape(bijector_lib.Bijector):
class Reshape(bijector.Bijector):
  """Reshapes the `event_shape` of a `Tensor`.

  The semantics generally follow that of `tf.reshape()`, with
@@ -128,7 +128,7 @@ class Weibull(bijector.Bijector):
      return x
    is_valid = check_ops.assert_non_negative(
        x,
        message="Forward transformation input must be at least {}.".format(0))
        message="Forward transformation input must be at least 0.")
    return control_flow_ops.with_dependencies([is_valid], x)

  def _maybe_assert_valid_y(self, y):
@@ -439,7 +439,7 @@ class _DistributionShape(object):
      if self._batch_ndims_is_0 and expand_batch_dim:
        squeeze_dims += [1]
      if squeeze_dims:
        x = array_ops.squeeze(x, squeeze_dims=squeeze_dims)
        x = array_ops.squeeze(x, axis=squeeze_dims)
        # x.shape: [prod(S)]+B+E
        _, batch_shape, event_shape = self.get_shape(x)
    else:
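This hunk and the similar ones later in the commit are a mechanical rename: `squeeze_dims` was the deprecated keyword argument of `tf.squeeze`, and `axis` is the canonical spelling. Both produce the same result (TF 1.x sketch):

```python
import tensorflow as tf

t = tf.zeros([2, 1, 3])
old = tf.squeeze(t, squeeze_dims=[1])  # deprecated keyword
new = tf.squeeze(t, axis=[1])          # canonical keyword; same [2, 3] shape
```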
@@ -397,7 +397,7 @@ class GmmAlgorithm(object):
    # Compute the effective number of data points assigned to component k.
    with ops.control_dependencies(self._w):
      points_in_k = array_ops.squeeze(
          math_ops.add_n(self._points_in_k), squeeze_dims=[0])
          math_ops.add_n(self._points_in_k), axis=[0])
      # Update alpha.
      if 'w' in self._params:
        final_points_in_k = points_in_k / num_batches
@@ -932,7 +932,8 @@ def convolution(inputs,
                variables_collections=None,
                outputs_collections=None,
                trainable=True,
                scope=None):
                scope=None,
                conv_dims=None):
  """Adds an N-D convolution followed by an optional batch_norm layer.

  It is required that 1 <= N <= 3.
@@ -993,6 +994,10 @@
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    scope: Optional scope for `variable_scope`.
    conv_dims: Optional convolution dimensionality, when set it would use the
      corresponding convolution (e.g. 2 for Conv 2D, 3 for Conv 3D, ..). When
      left to None it would select the convolution dimensionality based on
      the input rank (i.e. Conv ND, with N = input_rank - 2).

  Returns:
    A tensor representing the output of the operation.
@@ -1015,6 +1020,9 @@
    inputs = ops.convert_to_tensor(inputs)
    input_rank = inputs.get_shape().ndims

    if conv_dims is not None and conv_dims + 2 != input_rank:
      raise ValueError('Convolution expects input with rank %d, got %d' %
                       (conv_dims + 2, input_rank))
    if input_rank == 3:
      layer_class = convolutional_layers.Convolution1D
    elif input_rank == 4:
@@ -1061,10 +1069,134 @@
      outputs = activation_fn(outputs)
    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)

@add_arg_scope
def convolution1d(inputs,
                  num_outputs,
                  kernel_size,
                  stride=1,
                  padding='SAME',
                  data_format=None,
                  rate=1,
                  activation_fn=nn.relu,
                  normalizer_fn=None,
                  normalizer_params=None,
                  weights_initializer=initializers.xavier_initializer(),
                  weights_regularizer=None,
                  biases_initializer=init_ops.zeros_initializer(),
                  biases_regularizer=None,
                  reuse=None,
                  variables_collections=None,
                  outputs_collections=None,
                  trainable=True,
                  scope=None):
  return convolution(inputs,
                     num_outputs,
                     kernel_size,
                     stride,
                     padding,
                     data_format,
                     rate,
                     activation_fn,
                     normalizer_fn,
                     normalizer_params,
                     weights_initializer,
                     weights_regularizer,
                     biases_initializer,
                     biases_regularizer,
                     reuse,
                     variables_collections,
                     outputs_collections,
                     trainable,
                     scope,
                     conv_dims=1)

convolution2d = convolution
convolution3d = convolution
convolution1d.__doc__ = convolution.__doc__

@add_arg_scope
def convolution2d(inputs,
                  num_outputs,
                  kernel_size,
                  stride=1,
                  padding='SAME',
                  data_format=None,
                  rate=1,
                  activation_fn=nn.relu,
                  normalizer_fn=None,
                  normalizer_params=None,
                  weights_initializer=initializers.xavier_initializer(),
                  weights_regularizer=None,
                  biases_initializer=init_ops.zeros_initializer(),
                  biases_regularizer=None,
                  reuse=None,
                  variables_collections=None,
                  outputs_collections=None,
                  trainable=True,
                  scope=None):
  return convolution(inputs,
                     num_outputs,
                     kernel_size,
                     stride,
                     padding,
                     data_format,
                     rate,
                     activation_fn,
                     normalizer_fn,
                     normalizer_params,
                     weights_initializer,
                     weights_regularizer,
                     biases_initializer,
                     biases_regularizer,
                     reuse,
                     variables_collections,
                     outputs_collections,
                     trainable,
                     scope,
                     conv_dims=2)

convolution2d.__doc__ = convolution.__doc__

@add_arg_scope
def convolution3d(inputs,
                  num_outputs,
                  kernel_size,
                  stride=1,
                  padding='SAME',
                  data_format=None,
                  rate=1,
                  activation_fn=nn.relu,
                  normalizer_fn=None,
                  normalizer_params=None,
                  weights_initializer=initializers.xavier_initializer(),
                  weights_regularizer=None,
                  biases_initializer=init_ops.zeros_initializer(),
                  biases_regularizer=None,
                  reuse=None,
                  variables_collections=None,
                  outputs_collections=None,
                  trainable=True,
                  scope=None):
  return convolution(inputs,
                     num_outputs,
                     kernel_size,
                     stride,
                     padding,
                     data_format,
                     rate,
                     activation_fn,
                     normalizer_fn,
                     normalizer_params,
                     weights_initializer,
                     weights_regularizer,
                     biases_initializer,
                     biases_regularizer,
                     reuse,
                     variables_collections,
                     outputs_collections,
                     trainable,
                     scope,
                     conv_dims=3)

convolution3d.__doc__ = convolution.__doc__

@add_arg_scope
def convolution2d_in_plane(
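A usage sketch of the wrappers defined above (assumes `tf.contrib.layers`): each wrapper pins `conv_dims`, so a mismatched input rank now raises instead of silently dispatching on rank:

```python
import tensorflow as tf
from tensorflow.contrib import layers

images = tf.random_uniform((5, 7, 9, 3))    # rank 4 -> Conv2D
net = layers.convolution2d(images, 32, 3)   # ok: conv_dims=2 matches

volumes = tf.random_uniform((5, 6, 7, 9, 3))  # rank 5 input
# layers.convolution2d(volumes, 32, 3)  # ValueError: expects rank 4, got 5
```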
@@ -1411,7 +1543,7 @@ def dense_to_sparse(tensor, eos_token=0, outputs_collections=None, scope=None):
  Args:
    tensor: An `int` `Tensor` to be converted to a `Sparse`.
    eos_token: An integer.
      It is part of the target label that signfies the end of a sentence.
      It is part of the target label that signifies the end of a sentence.
    outputs_collections: Collection to add the outputs.
    scope: Optional scope for name_scope.
  """
@@ -1555,7 +1687,7 @@ def _inner_flatten(inputs, new_rank, output_collections=None, scope=None):
    output_collections: Collection to which the outputs will be added.
    scope: Optional scope for `name_scope`.
  Returns:
    A `Tensor` or `SparseTensor` conataining the same values as `inputs`, but
    A `Tensor` or `SparseTensor` containing the same values as `inputs`, but
    with innermost dimensions flattened to obtain rank `new_rank`.

  Raises:
@@ -310,6 +310,17 @@ class BiasAddTest(test.TestCase):

class ConvolutionTest(test.TestCase):

  def testInvalidShape(self):
    with self.test_session():
      images_2d = random_ops.random_uniform((5, 7, 9, 3), seed=1)
      with self.assertRaisesRegexp(
          ValueError, 'Convolution expects input with rank 5, got 4'):
        layers_lib.convolution3d(images_2d, 32, 3)
      images_3d = random_ops.random_uniform((5, 6, 7, 9, 3), seed=1)
      with self.assertRaisesRegexp(
          ValueError, 'Convolution expects input with rank 4, got 5'):
        layers_lib.convolution2d(images_3d, 32, 3)

  def testInvalidDataFormat(self):
    height, width = 7, 9
    with self.test_session():
@@ -3155,7 +3166,7 @@ class RepeatTests(test.TestCase):
    with self.test_session():
      images = np.random.uniform(size=(5, height, width, 3)).astype(np.float32)
      output = _layers.repeat(images, 3, layers_lib.conv2d, 32, [3, 3])
      self.assertEqual(output.op.name, 'Repeat/convolution_3/Relu')
      self.assertEqual(output.op.name, 'Repeat/convolution2d_3/Relu')
      self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 32])

  def testRepeatWithScope(self):
@@ -3749,7 +3760,7 @@ class StackTests(test.TestCase):
          layers_lib.convolution2d, [10, 20, 30],
          kernel_size=[3, 3],
          padding='SAME')
      self.assertEqual(output.op.name, 'Stack/convolution_3/Relu')
      self.assertEqual(output.op.name, 'Stack/convolution2d_3/Relu')
      self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 30])

  def testStackWithScope(self):
@@ -270,7 +270,7 @@ class _RegressionTargetColumn(_TargetColumn):

  def logits_to_predictions(self, logits, proba=False):
    if self.num_label_columns == 1:
      return array_ops.squeeze(logits, squeeze_dims=[1])
      return array_ops.squeeze(logits, axis=[1])
    return logits

  def get_eval_ops(self, features, logits, labels, metrics=None):
@@ -418,7 +418,7 @@ def _softmax_cross_entropy_loss(logits, target):
                     "Instead got %s." % target.dtype)
  # sparse_softmax_cross_entropy_with_logits requires [batch_size] target.
  if len(target.get_shape()) == 2:
    target = array_ops.squeeze(target, squeeze_dims=[1])
    target = array_ops.squeeze(target, axis=[1])
  loss_vec = nn.sparse_softmax_cross_entropy_with_logits(
      labels=target, logits=logits)
  return loss_vec
@@ -777,7 +777,7 @@ class _RegressionHead(_SingleHead):
    key = prediction_key.PredictionKey.SCORES
    with ops.name_scope(None, "predictions", (logits,)):
      if self.logits_dimension == 1:
        logits = array_ops.squeeze(logits, squeeze_dims=(1,), name=key)
        logits = array_ops.squeeze(logits, axis=(1,), name=key)
      return {key: self._link_fn(logits)}

  def _metrics(self, eval_loss, predictions, labels, weights):
@@ -974,7 +974,7 @@ def _softmax_cross_entropy_loss(labels, logits, weights=None):
    is_squeezed_labels = False
    # TODO(ptucker): This will break for dynamic shapes.
    if len(labels.get_shape()) == 2:
      labels = array_ops.squeeze(labels, squeeze_dims=(1,))
      labels = array_ops.squeeze(labels, axis=(1,))
      is_squeezed_labels = True

    loss = nn.sparse_softmax_cross_entropy_with_logits(
@@ -40,7 +40,7 @@ def mean_squared_error_regressor(tensor_in, labels, weights, biases, name=None):
                      [tensor_in, labels]):
    predictions = nn.xw_plus_b(tensor_in, weights, biases)
    if len(labels.get_shape()) == 1 and len(predictions.get_shape()) == 2:
      predictions = array_ops_.squeeze(predictions, squeeze_dims=[1])
      predictions = array_ops_.squeeze(predictions, axis=[1])
    return predictions, losses.mean_squared_error(labels, predictions)
@@ -17,6 +17,7 @@ limitations under the License.
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <memory>
#include <sstream>
@@ -70,6 +71,23 @@ TfLiteStatus ReadLabelsFile(const string& file_name,
  return kTfLiteOk;
}

void PrintProfilingInfo(const profiling::ProfileEvent* e, uint32_t op_index,
                        TfLiteRegistration registration) {
  // output something like
  // time (ms) , Node xxx, OpCode xxx, symbolic name
  //      5.352, Node   5, OpCode   4, DEPTHWISE_CONV_2D

  LOG(INFO) << std::fixed << std::setw(10) << std::setprecision(3)
            << (e->end_timestamp_us - e->begin_timestamp_us) / 1000.0
            << ", Node " << std::setw(3) << std::setprecision(3) << op_index
            << ", OpCode " << std::setw(3) << std::setprecision(3)
            << registration.builtin_code << ", "
            << EnumNameBuiltinOperator(
                   (BuiltinOperator)registration.builtin_code)
            << "\n";
}

void RunInference(Settings* s) {
  if (!s->model_name.c_str()) {
    LOG(ERROR) << "no model file name\n";
@@ -166,6 +184,11 @@ void RunInference(Settings* s) {
    exit(-1);
  }

  profiling::Profiler* profiler = new profiling::Profiler();
  interpreter->SetProfiler(profiler);

  if (s->profiling) profiler->StartProfiling();

  struct timeval start_time, stop_time;
  gettimeofday(&start_time, NULL);
  for (int i = 0; i < s->loop_count; i++) {
@@ -179,6 +202,18 @@ void RunInference(Settings* s) {
            << (get_us(stop_time) - get_us(start_time)) / (s->loop_count * 1000)
            << " ms \n";

  if (s->profiling) {
    profiler->StopProfiling();
    auto profile_events = profiler->GetProfileEvents();
    for (int i = 0; i < profile_events.size(); i++) {
      auto op_index = profile_events[i]->event_metadata;
      const auto node_and_registration =
          interpreter->node_and_registration(op_index);
      const TfLiteRegistration registration = node_and_registration->second;
      PrintProfilingInfo(profile_events[i], op_index, registration);
    }
  }

  const int output_size = 1000;
  const size_t num_results = 5;
  const float threshold = 0.001f;
@@ -217,13 +252,14 @@ void RunInference(Settings* s) {

void display_usage() {
  LOG(INFO) << "label_image\n"
            << "--accelerated, -a: [0|1], use Android NNAPI or note\n"
            << "--accelerated, -a: [0|1], use Android NNAPI or not\n"
            << "--count, -c: loop interpreter->Invoke() for certain times\n"
            << "--input_mean, -b: input mean\n"
            << "--input_std, -s: input standard deviation\n"
            << "--image, -i: image_name.bmp\n"
            << "--labels, -l: labels for the model\n"
            << "--tflite_model, -m: model_name.tflite\n"
            << "--profiling, -p: [0|1], profiling or not\n"
            << "--threads, -t: number of threads\n"
            << "--verbose, -v: [0|1] print more information\n"
            << "\n";
@@ -241,6 +277,7 @@ int Main(int argc, char** argv) {
        {"image", required_argument, 0, 'i'},
        {"labels", required_argument, 0, 'l'},
        {"tflite_model", required_argument, 0, 'm'},
        {"profiling", required_argument, 0, 'p'},
        {"threads", required_argument, 0, 't'},
        {"input_mean", required_argument, 0, 'b'},
        {"input_std", required_argument, 0, 's'},
@@ -249,7 +286,7 @@ int Main(int argc, char** argv) {
    /* getopt_long stores the option index here. */
    int option_index = 0;

    c = getopt_long(argc, argv, "a:b:c:f:i:l:m:s:t:v:", long_options,
    c = getopt_long(argc, argv, "a:b:c:f:i:l:m:p:s:t:v:", long_options,
                    &option_index);

    /* Detect the end of the options. */
@@ -276,6 +313,10 @@ int Main(int argc, char** argv) {
      case 'm':
        s.model_name = optarg;
        break;
      case 'p':
        s.profiling = strtol(  // NOLINT(runtime/deprecated_fn)
            optarg, (char**)NULL, 10);
        break;
      case 's':
        s.input_std = strtod(optarg, NULL);
        break;
@@ -25,6 +25,7 @@ struct Settings {
  bool verbose = false;
  bool accel = false;
  bool input_floating = false;
  bool profiling = false;
  int loop_count = 1;
  float input_mean = 127.5f;
  float input_std = 127.5f;
@@ -84,4 +84,32 @@
        android:visibility="visible" />
    </RelativeLayout>

    <RelativeLayout
        android:id="@+id/control2"
        android:layout_width="match_parent"
        android:layout_height="135dp"
        android:layout_alignParentLeft="true"
        android:layout_alignParentStart="true"
        android:layout_alignTop="@+id/control"
        android:layout_marginLeft="300dp"
        android:layout_marginStart="300dp"
        android:background="@color/control_background">

      <ToggleButton
          android:id="@+id/button"
          android:textOff="@string/tflite"
          android:textOn="@string/nnapi"
          android:layout_width="wrap_content"
          android:layout_height="wrap_content"
          android:layout_alignParentLeft="true"
          android:layout_alignParentStart="true" />

      <NumberPicker
          android:id="@+id/np"
          android:layout_width="wrap_content"
          android:layout_height="wrap_content"
          android:layout_below="@+id/button"
          android:visibility="visible" />
    </RelativeLayout>

</RelativeLayout>
@@ -25,8 +25,8 @@ namespace builtin {
namespace topk_v2 {
constexpr int kInputTensor = 0;
constexpr int kInputTopK = 1;
constexpr int kOutputIndexes = 0;
constexpr int kOutputValues = 1;
constexpr int kOutputValues = 0;
constexpr int kOutputIndexes = 1;

namespace {
TfLiteStatus ResizeOutput(TfLiteContext* context, TfLiteNode* node) {
@@ -31,8 +31,8 @@ class TopKV2OpModel : public SingleOpModel {
               int top_k) {
    input_ = AddInput(input_type);
    top_k_ = AddInput(TensorType_INT32);
    output_indexes_ = AddOutput(TensorType_INT32);
    output_values_ = AddOutput(input_type);
    output_indexes_ = AddOutput(TensorType_INT32);
    SetBuiltinOp(BuiltinOperator_TOPK_V2, BuiltinOptions_TopKV2Options, 0);
    BuildInterpreter({input_shape, {1}});
    PopulateTensor<int32_t>(top_k_, {top_k});
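The swap above brings the TFLite kernel in line with TensorFlow's convention that values are the first output and indices the second, e.g. in the Python API:

```python
import tensorflow as tf

x = tf.constant([1., 4., 2., 8.])
values, indices = tf.nn.top_k(x, k=2)  # values first, then indices
# values  -> [8., 4.]
# indices -> [3, 1]
```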
@@ -609,7 +609,7 @@ enum {
     * Long short-term memory unit (LSTM) recurrent network layer.
     *
     * The default non-peephole implementation is based on:
     * http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf
     * http://www.bioinf.jku.at/publications/older/2604.pdf
     * S. Hochreiter and J. Schmidhuber. "Long Short-Term Memory". Neural
     * Computation, 9(8):1735-1780, 1997.
     *
@@ -37,9 +37,9 @@ struct ProfileEvent {
  // Label of the event. This usually describes the event.
  const char* tag;
  // Timestamp in microseconds when the event began.
  int64_t begin_timestamp_us;
  uint64_t begin_timestamp_us;
  // Timestamp in microseconds when the event ended.
  int64_t end_timestamp_us;
  uint64_t end_timestamp_us;
  // The field containing the type of event. This must be one of the event types
  // in EventType.
  EventType event_type;
@@ -74,7 +74,7 @@ class ProfileBuffer {
    if (!enabled_) {
      return kInvalidEventHandle;
    }
    int64_t timestamp = NowMicros();
    uint64_t timestamp = NowMicros();
    int index = current_index_ % event_buffer_.size();
    event_buffer_[index].tag = tag;
    event_buffer_[index].event_type = event_type;
@@ -134,7 +134,7 @@ class ProfileBuffer {
  }

 private:
  static int64_t NowMicros() {
  static uint64_t NowMicros() {
    // TODO(shashishekhar): Refactor this to a separate file.
    struct timeval tv;
    gettimeofday(&tv, nullptr);
@@ -124,6 +124,15 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
      SetDataTypeForAllOutputs(model, op, rand_op->dtype);
      break;
    }
    case OperatorType::kTopK_V2: {
      // topk(values: T, k: int32) -> values: T, indices: int32
      CHECK_EQ(op->inputs.size(), 2);
      CHECK_EQ(op->outputs.size(), 2);
      CHECK(model->GetArray(op->inputs[1]).data_type == ArrayDataType::kInt32);
      model->GetArray(op->outputs[0]).data_type = model->GetArray(op->inputs[0]).data_type;
      model->GetArray(op->outputs[1]).data_type = ArrayDataType::kInt32;
      break;
    }
    case OperatorType::kTensorFlowUnsupported: {
      auto* unsupported_op = static_cast<TensorFlowUnsupportedOperator*>(op);
      // Some output tensors from the op could be eliminated by optimization.
@@ -1087,8 +1087,8 @@ void ProcessGatherOperator(Model* model, GatherOperator* op) {
void ProcessTopkV2Operator(Model* model, TopKV2Operator* op) {
  const auto& input_values = model->GetArray(op->inputs[0]);
  const auto& input_k = model->GetArray(op->inputs[1]);
  auto& output_indexes = model->GetArray(op->outputs[0]);
  auto& output_values = model->GetArray(op->outputs[1]);
  auto& output_values = model->GetArray(op->outputs[0]);
  auto& output_indexes = model->GetArray(op->outputs[1]);

  // Bail if we already know the output shape.
  if (output_indexes.has_shape()) {
@@ -1991,7 +1991,7 @@ void ConvertTopKV2Operator(const NodeDef& node,
    op->inputs.push_back(node.input(1));
  }
  // The op has two outputs.
  op->outputs.push_back(node.name() + ":0");
  op->outputs.push_back(node.name());
  op->outputs.push_back(node.name() + ":1");
  model->operators.emplace_back(op.release());
}
@@ -825,11 +825,6 @@ void FixNoOrphanedArray(Model* model) {
void CheckEachArray(const Model& model) {
  for (const auto& array_entry : model.GetArrayMap()) {
    const auto& array = array_entry.second;
    if (array->has_shape()) {
      for (int d : array->shape().dims()) {
        CHECK_GE(d, 1);
      }
    }
    // It's OK to have a buffer or an alloc, but not both.
    // (Since allocs are for transient arrays without a buffer).
    CHECK(!array->buffer || !array->alloc);
@@ -839,6 +834,10 @@ void CheckEachArray(const Model& model) {
      // The presence of a fixed buffer should imply the presence of a fixed
      // shape.
      CHECK(array->has_shape());
      // Constant buffer should have a valid shape.
      for (int d : array->shape().dims()) {
        CHECK_GE(d, 1);
      }
      // The shape flat-size should agree with the buffer length.
      CHECK_EQ(array->buffer->Length(),
               RequiredBufferSizeForShape(array->shape()));
@@ -22,6 +22,7 @@ limitations under the License.
#include <string>
#include <vector>

#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/lib/strings/str_util.h"

// Skip MPI C++ bindings support, this matches the usage in other places
@@ -56,21 +56,21 @@ class LazyAdamOptimizer(adam.AdamOptimizer):
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))

    # m := beta1 * m + (1 - beta1) * g_t
    # \\(m := beta1 * m + (1 - beta1) * g_t\\)
    m = self.get_slot(var, "m")
    m_t = state_ops.scatter_update(m, grad.indices,
                                   beta1_t * array_ops.gather(m, grad.indices) +
                                   (1 - beta1_t) * grad.values,
                                   use_locking=self._use_locking)

    # v := beta2 * v + (1 - beta2) * (g_t * g_t)
    # \\(v := beta2 * v + (1 - beta2) * (g_t * g_t)\\)
    v = self.get_slot(var, "v")
    v_t = state_ops.scatter_update(v, grad.indices,
                                   beta2_t * array_ops.gather(v, grad.indices) +
                                   (1 - beta2_t) * math_ops.square(grad.values),
                                   use_locking=self._use_locking)

    # variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))
    # \\(variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))\\)
    m_t_slice = array_ops.gather(m_t, grad.indices)
    v_t_slice = array_ops.gather(v_t, grad.indices)
    denominator_slice = math_ops.sqrt(v_t_slice) + epsilon_t
@@ -40,23 +40,19 @@ class AdamOptimizer(optimizer_v2.OptimizerV2):

  Initialization:

  ```
  m_0 <- 0 (Initialize initial 1st moment vector)
  v_0 <- 0 (Initialize initial 2nd moment vector)
  t <- 0 (Initialize timestep)
  ```
  $$m_0 := 0 (Initialize initial 1st moment vector)$$
  $$v_0 := 0 (Initialize initial 2nd moment vector)$$
  $$t := 0 (Initialize timestep)$$

  The update rule for `variable` with gradient `g` uses an optimization
  described at the end of section 2 of the paper:

  ```
  t <- t + 1
  lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
  $$t := t + 1$$
  $$lr_t := \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$

  m_t <- beta1 * m_{t-1} + (1 - beta1) * g
  v_t <- beta2 * v_{t-1} + (1 - beta2) * g * g
  variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
  ```
  $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
  $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
  $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$

  The default value of 1e-8 for epsilon might not be a good default in
  general. For example, when training an Inception network on ImageNet a
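One worked Adam step in NumPy, following the update rules above (illustrative scalars only):

```python
import numpy as np

lr, beta1, beta2, eps = 0.001, 0.9, 0.999, 1e-8
m, v, var, g, t = 0.0, 0.0, 1.0, 0.5, 1

m = beta1 * m + (1 - beta1) * g        # 0.05
v = beta2 * v + (1 - beta2) * g * g    # 0.00025
lr_t = lr * np.sqrt(1 - beta2**t) / (1 - beta1**t)
var -= lr_t * m / (np.sqrt(v) + eps)   # moves var by roughly lr on step 1
```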
@@ -307,6 +307,21 @@ class LSTMTest(test.TestCase):
    self._seed = 23489
    np.random.seed(self._seed)

  def testDType(self):
    # Test case for GitHub issue 16228
    # Not passing dtype in constructor results in default float32
    lstm = rnn_cell.LSTMCell(10)
    input_tensor = array_ops.ones([10, 50])
    lstm.build(input_tensor.get_shape())
    self.assertEqual(lstm._bias.dtype, dtypes.float32_ref)

    # Explicitly pass dtype in constructor
    for dtype in [dtypes.float16, dtypes.float32, dtypes.float64]:
      lstm = rnn_cell.LSTMCell(10, dtype=dtype)
      input_tensor = array_ops.ones([10, 50])
      lstm.build(input_tensor.get_shape())
      self.assertEqual(lstm._bias.dtype, dtype._as_ref)

  def testNoProjNoSharding(self):
    num_units = 3
    input_size = 5
@@ -37,7 +37,7 @@ def _top_k_generator(k):
  def _top_k(probabilities, targets):
    targets = math_ops.to_int32(targets)
    if targets.get_shape().ndims > 1:
      targets = array_ops.squeeze(targets, squeeze_dims=[1])
      targets = array_ops.squeeze(targets, axis=[1])
    return metric_ops.streaming_mean(nn.in_top_k(probabilities, targets, k))
  return _top_k

@@ -57,7 +57,7 @@ def _r2(probabilities, targets, weights=None):


def _squeeze_and_onehot(targets, depth):
  targets = array_ops.squeeze(targets, squeeze_dims=[1])
  targets = array_ops.squeeze(targets, axis=[1])
  return array_ops.one_hot(math_ops.to_int32(targets), depth)
@@ -55,7 +55,7 @@ class ManyToOneLayer(hybrid_layer.HybridLayer):

    # There is always one activation per instance by definition, so squeeze
    # away the extra dimension.
    return array_ops.squeeze(nn_activations, squeeze_dims=[1])
    return array_ops.squeeze(nn_activations, axis=[1])


class FlattenedFullyConnectedLayer(hybrid_layer.HybridLayer):
@@ -445,7 +445,7 @@ class RandomForestGraphs(object):
      mask = math_ops.less(
          r, array_ops.ones_like(r) * self.params.bagging_fraction)
      gather_indices = array_ops.squeeze(
          array_ops.where(mask), squeeze_dims=[1])
          array_ops.where(mask), axis=[1])
      # TODO(thomaswc): Calculate out-of-bag data and labels, and store
      # them for use in calculating statistics later.
      tree_data = array_ops.gather(processed_dense_features, gather_indices)
@@ -111,20 +111,22 @@ void GetSubGraphOutgoingEdges(const tensorflow::Graph& graph,
    }
  }
}

std::pair<string, int> ParseTensorName(string name, int default_idx = 0) {
std::pair<string, int> ParseTensorName(const string& name,
                                       int default_idx = 0) {
  string name_no_idx = name;
  int idx = default_idx;
  size_t sep = name.find_last_of(':');
  const size_t sep = name_no_idx.find_last_of(':');
  if (sep != string::npos) {
    name = name.substr(0, sep);
    name_no_idx = name_no_idx.substr(0, sep);
    idx = std::stoi(name.substr(sep + 1));
  }
  return std::make_pair(name, idx);
  return std::make_pair(name_no_idx, idx);
}

std::unordered_map<string, std::vector<int>> BuildTensorNameMap(
    const std::vector<string>& tensor_names) {
  std::unordered_map<string, std::vector<int>> result;
  for (string const& tensor_name : tensor_names) {
  for (const string& tensor_name : tensor_names) {
    string node_name;
    int index;
    std::tie(node_name, index) = ParseTensorName(tensor_name);
@@ -132,6 +134,7 @@ std::unordered_map<string, std::vector<int>> BuildTensorNameMap(
  }
  return result;
}

// TODO(sami): convert references to pointers
struct ConvertGraphParams {
  ConvertGraphParams(
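The behavior `ParseTensorName` preserves, sketched as a hypothetical Python helper (the C++ change only makes the argument a const reference and stops mutating it):

```python
def parse_tensor_name(name, default_idx=0):
  """Split 'node:2' into ('node', 2); a bare 'node' keeps default_idx."""
  node, sep, idx = name.rpartition(':')
  return (node, int(idx)) if sep else (name, default_idx)

assert parse_tensor_name("conv1/weights:1") == ("conv1/weights", 1)
assert parse_tensor_name("conv1/weights") == ("conv1/weights", 0)
```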
@@ -78,7 +78,7 @@ class StubTimeSeriesModel(model.TimeSeriesModel):
    batch_end_values = array_ops.squeeze(
        array_ops.slice(values, [0, array_ops.shape(times)[1] - 1, 0],
                        [-1, 1, -1]),
        squeeze_dims=[1, 2])
        axis=[1, 2])
    # A pretty odd but easy to think about loss: L1 loss on the batch end
    # values.
    loss = math_ops.reduce_sum(
@@ -170,7 +170,7 @@ class KalmanFilter(object):
        math_ops.matmul(
            transition_matrices,
            prior_state[..., None]),
        squeeze_dims=[-1])
        axis=[-1])
    return advanced_state

  def predict_state_var(
@@ -254,7 +254,7 @@ class KalmanFilter(object):
            kalman_gain_transposed,
            array_ops.expand_dims(residual, -1),
            adjoint_a=True),
        squeeze_dims=[-1])
        axis=[-1])
    gain_obs = math_ops.matmul(
        kalman_gain_transposed, observation_model, adjoint_a=True)
    identity_extradim = linalg_ops.eye(
@@ -332,7 +332,7 @@ class KalmanFilter(object):
            array_ops.expand_dims(state_mean, 1),
            observation_model,
            adjoint_b=True),
        squeeze_dims=[1])
        axis=[1])
    observed_var = math_ops.matmul(
        math_ops.matmul(observation_model, state_var),
        observation_model,
@@ -2292,7 +2292,9 @@ tf_cuda_library(

CORE_CPU_BASE_HDRS = GRAPH_HDRS + [
    "common_runtime/device.h",
    "common_runtime/device_factory.h",
    "common_runtime/device_mgr.h",
    "common_runtime/device_set.h",
    "common_runtime/eval_const_tensor.h",
    "common_runtime/graph_runner.h",
    "common_runtime/shape_refiner.h",
@@ -2350,9 +2352,7 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
    "common_runtime/copy_tensor.h",
    "common_runtime/costmodel_manager.h",
    "common_runtime/debugger_state_interface.h",
    "common_runtime/device_factory.h",
    "common_runtime/device_resolver_local.h",
    "common_runtime/device_set.h",
    "common_runtime/dma_helper.h",
    "common_runtime/eigen_thread_pool.h",
    "common_runtime/executor.h",
@ -82,9 +82,9 @@ END
|
||||
}
|
||||
summary: "Update \'*var\' according to the Adam algorithm."
|
||||
description: <<END
|
||||
lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
|
||||
m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
|
||||
v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t
|
||||
variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
|
||||
$$lr_t := \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
|
||||
$$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
|
||||
$$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
|
||||
$$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
|
||||
END
|
||||
}
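Reading the update rule in order: a bias-corrected step size, first- and second-moment updates, then the variable step. A minimal sketch of a few Adam steps for a single scalar, following the formulas above (plain C++ with hypothetical variable names; not the kernel implementation):

```c++
#include <cmath>
#include <cstdio>

int main() {
  double var = 1.0, m = 0.0, v = 0.0;  // variable and moment estimates
  const double learning_rate = 0.001, beta1 = 0.9, beta2 = 0.999,
               epsilon = 1e-8;
  const double g = 0.5;  // gradient observed at each step
  for (int t = 1; t <= 3; ++t) {
    // lr_t = learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
    const double lr_t = learning_rate * std::sqrt(1 - std::pow(beta2, t)) /
                        (1 - std::pow(beta1, t));
    m = beta1 * m + (1 - beta1) * g;          // first-moment update
    v = beta2 * v + (1 - beta2) * g * g;      // second-moment update
    var -= lr_t * m / (std::sqrt(v) + epsilon);
    std::printf("t=%d var=%f\n", t, var);
  }
  return 0;
}
```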

@ -24,5 +24,6 @@ pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
                      [0, 0, 2, 2, 0, 0]
                      [0, 0, 0, 0, 0, 0]]
```

END
}

@ -44,6 +44,7 @@ In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
if T == qint8, out[i] -= (range(T) + 1) / 2.0
```

here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`

*MIN_COMBINED Mode Example*
@ -87,6 +88,7 @@ choosing to elide the lowest possible value for symmetry (e.g., output range is

We first find the range of values in our tensor. The
range we use is always centered on 0, so we find m such that

```c++
m = max(abs(input_min), abs(input_max))
```
@ -95,6 +97,7 @@ Our input tensor range is then `[-m, m]`.

Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
If T is signed, this is

```
num_bits = sizeof(T) * 8
[min_fixed, max_fixed] =
@ -102,16 +105,19 @@ If T is signed, this is
```

Otherwise, if T is unsigned, the fixed-point range is

```
[min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
```

From this we compute our scaling factor, s:

```c++
s = (max_fixed - min_fixed) / (2 * m)
```

Now we can quantize the elements of our tensor:

```c++
result = round(input * s)
```
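Putting the scaled-mode steps together, a small end-to-end sketch for a signed 8-bit `T` (plain C++ with hypothetical names; it assumes the usual signed fixed-point range `[-(1 << (num_bits - 1)), (1 << (num_bits - 1)) - 1]`, since the range expression above is truncated in this hunk):

```c++
#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  const double input_min = -10.0, input_max = 6.0;
  // The range is centered on 0: m = max(abs(input_min), abs(input_max)).
  const double m = std::max(std::abs(input_min), std::abs(input_max));
  // Signed 8-bit fixed-point buckets.
  const int num_bits = 8;
  const double min_fixed = -(1 << (num_bits - 1));
  const double max_fixed = (1 << (num_bits - 1)) - 1;
  // Scaling factor s = (max_fixed - min_fixed) / (2 * m).
  const double s = (max_fixed - min_fixed) / (2 * m);
  const double input = 3.5;
  const double result = std::round(input * s);  // the quantized value
  std::printf("m=%f s=%f result=%f\n", m, s, result);
  return 0;
}
```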

@ -76,9 +76,9 @@ END
  }
  summary: "Update \'*var\' according to the Adam algorithm."
  description: <<END
lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t
variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
$$lr_t := \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
$$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
$$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
$$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
END
}

@ -25,12 +25,12 @@ A new tensor with the given shape and updates applied according
to the indices.
END
  }
  summary: "Scatter `updates` into a new (initially zero) tensor according to `indices`."
  summary: "Scatter `updates` into a new tensor according to `indices`."
  description: <<END
Creates a new tensor by applying sparse `updates` to individual
values or slices within a zero tensor of the given `shape` according to
indices. This operator is the inverse of the @{tf.gather_nd} operator which
extracts values or slices from a given tensor.
Creates a new tensor by applying sparse `updates` to individual values or
slices within a tensor (initially zero for numeric, empty for string) of
the given `shape` according to indices. This operator is the inverse of the
@{tf.gather_nd} operator which extracts values or slices from a given tensor.

**WARNING**: The order in which updates are applied is nondeterministic, so the
output will be nondeterministic if `indices` contains duplicates.
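A minimal sketch of the scatter semantics for the simplest case, rank-1 indices into a rank-1 zero-initialized output (plain C++ with hypothetical names; not the op implementation):

```c++
#include <cstdio>
#include <vector>

int main() {
  // shape = [8], indices = [[4], [3], [1], [7]], updates = [9, 10, 11, 12].
  std::vector<int> output(8, 0);  // initially zero, like the numeric op output
  const std::vector<int> indices = {4, 3, 1, 7};
  const std::vector<int> updates = {9, 10, 11, 12};
  for (size_t i = 0; i < indices.size(); ++i) {
    output[indices[i]] = updates[i];  // duplicate indices would be nondeterministic
  }
  for (int x : output) std::printf("%d ", x);  // 0 11 0 10 9 0 0 12
  std::printf("\n");
  return 0;
}
```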

@ -490,7 +490,7 @@ Status GraphExecutionState::OptimizeGraph(
      cpu_device = device;
    }
  }
  grappler::VirtualCluster cluster(device_map);
  grappler::VirtualCluster cluster(device_map, device_set_);
  GraphDef new_graph;
  TF_RETURN_IF_ERROR(grappler::RunMetaOptimizer(
      item, rewrite_options, cpu_device, &cluster, &new_graph));

@ -547,14 +547,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass {

    // If Op has been specifically assigned to a non-CPU device, then No.
    if (!n->assigned_device_name().empty() &&
        !StringPiece(n->assigned_device_name()).contains(kCPUDeviceSubStr)) {
        !str_util::StrContains(n->assigned_device_name(), kCPUDeviceSubStr)) {
      result = false;
      reason = "Op has been assigned a runtime device that is not CPU.";
    }

    // If user has specifically assigned this op to a non-CPU device, then No.
    if (!n->def().device().empty() &&
        !StringPiece(n->def().device()).contains(kCPUDeviceSubStr)) {
        !str_util::StrContains(n->def().device(), kCPUDeviceSubStr)) {
      result = false;
      reason = "User has assigned a device that is not CPU.";
    }
@ -2691,14 +2691,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass {

    // If Op has been specifically assigned to a non-CPU device, then No.
    if (!n->assigned_device_name().empty() &&
        !StringPiece(n->assigned_device_name()).contains(kCPUDeviceSubStr)) {
        !str_util::StrContains(n->assigned_device_name(), kCPUDeviceSubStr)) {
      result = false;
      reason = "Op has been assigned a runtime device that is not CPU.";
    }

    // If user has specifically assigned this op to a non-CPU device, then No.
    if (!n->def().device().empty() &&
        !StringPiece(n->def().device()).contains(kCPUDeviceSubStr)) {
        !str_util::StrContains(n->def().device(), kCPUDeviceSubStr)) {
      result = false;
      reason = "User has assigned a device that is not CPU.";
    }

@ -56,6 +56,7 @@ cc_library(
    ],
    visibility = ["//visibility:public"],
    deps = [
        "//tensorflow/core:core_cpu_base",
        "//tensorflow/core:framework",
        "//tensorflow/core:lib",
        "//tensorflow/core:protos_all_cc",
@ -73,6 +74,7 @@ cc_library(
    visibility = ["//visibility:public"],
    deps = [
        ":cluster",
        "//tensorflow/core:core_cpu_base",
        "//tensorflow/core:framework",
        "//tensorflow/core:protos_all_cc",
        "//tensorflow/core/grappler/costs:op_level_cost_estimator",

@ -21,6 +21,7 @@ limitations under the License.
#include <utility>
#include <vector>

#include "tensorflow/core/common_runtime/device_set.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/grappler/grappler_item.h"
#include "tensorflow/core/lib/core/status.h"
@ -92,6 +93,10 @@ class Cluster {
  // sorted alphabetically.
  const std::vector<string> GetDeviceNames() const;

  // The DeviceSet is not always available, but when it is it contains a
  // superset of the devices listed in GetDevices/GetDeviceNames().
  const DeviceSet* GetDeviceSet() const { return device_set_; }

  // Enables collecting the allocator stats. Call with enable=true must be made
  // before Provision().
  virtual Status EnablePeakMemoryStats(bool enable) {
@ -119,6 +124,7 @@ class Cluster {

 protected:
  std::unordered_map<string, DeviceProperties> devices_;
  const DeviceSet* device_set_ = nullptr;  // Not owned
  const int timeout_s_;
  SessionOptions options_;
  RunOptions run_options_;

@ -37,6 +37,14 @@ VirtualCluster::VirtualCluster(
    : Cluster(0), node_estimator_(node_estimator), node_manager_(node_manager) {
  devices_ = devices;
}

VirtualCluster::VirtualCluster(
    const std::unordered_map<string, DeviceProperties>& devices,
    const DeviceSet* device_set)
    : VirtualCluster(devices) {
  device_set_ = device_set;
}

VirtualCluster::~VirtualCluster() {}

Status VirtualCluster::Provision() { return Status::OK(); }

@ -17,6 +17,8 @@ limitations under the License.
#define TENSORFLOW_CORE_GRAPPLER_CLUSTERS_VIRTUAL_CLUSTER_H_

#include <unordered_map>

#include "tensorflow/core/common_runtime/device_set.h"
#include "tensorflow/core/grappler/clusters/cluster.h"
#include "tensorflow/core/grappler/costs/op_level_cost_estimator.h"
#include "tensorflow/core/grappler/costs/virtual_scheduler.h"
@ -34,6 +36,8 @@ class VirtualCluster : public Cluster {
  VirtualCluster(const std::unordered_map<string, DeviceProperties>& devices,
                 OpLevelCostEstimator* node_estimator,
                 ReadyNodeManager* node_manager);
  VirtualCluster(const std::unordered_map<string, DeviceProperties>& devices,
                 const DeviceSet* device_set);

  ~VirtualCluster() override;

@ -199,7 +199,7 @@ class FirstReadyManager : public ReadyNodeManager {
  // current node.
  std::vector<const NodeDef*> nodes_;
  // Newly added nodes are added to waiting_queue_. That way, GetCurrNode(),
  // wihch returns the front of the nodes_, always returns the same node,
  // which returns the front of the nodes_, always returns the same node,
  // even if any of new nodes has time_ready smaller than the current node's.
  std::vector<const NodeDef*> waiting_queue_;
  // Comparator functor for heap; stl heap is max heap, so we use "greater than"
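As that comment notes, the STL heap helpers build a max-heap by default, so a "greater than" comparator is what turns them into a min-heap. A standalone illustration of the idea (not the scheduler code):

```c++
#include <cstdio>
#include <functional>
#include <queue>
#include <vector>

int main() {
  // std::greater<int> makes the smallest element surface first (min-heap).
  std::priority_queue<int, std::vector<int>, std::greater<int>> min_heap;
  for (int t : {30, 10, 20}) min_heap.push(t);
  while (!min_heap.empty()) {
    std::printf("%d ", min_heap.top());  // prints: 10 20 30
    min_heap.pop();
  }
  std::printf("\n");
  return 0;
}
```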
@ -212,7 +212,7 @@ class FirstReadyManager : public ReadyNodeManager {
};

// CompositeNodeManager has a few other NodeManagers: per-device LIFO for normal
// ops (neither _Send nor _Recv) and FirstyReadyManagers for _Send ops and _Recv
// ops (neither _Send nor _Recv) and FirstReadyManagers for _Send ops and _Recv
// ops, and then it chooses FirstReady among the ops chosen from each
// internal NodeManagers. The objective is to maximize producer-consumer
// locality within device, while processing nodes across devices, including

@ -33,7 +33,7 @@ class CustomGraphOptimizerRegistry {
  static std::vector<string> GetRegisteredOptimizers();

  typedef std::function<CustomGraphOptimizer*()> Creator;
  // Regsiter graph optimizer which can be called during program initialization.
  // Register graph optimizer which can be called during program initialization.
  // This class is not thread-safe.
  static void RegisterOptimizerOrDie(const Creator& optimizer_creator,
                                     const string& name);

@ -160,13 +160,26 @@ Status MetaOptimizer::InitializeOptimizersByName(
      VLOG(2) << "Can't register an optimizer by name: " << optimizer_name;
    }
  }
  for (const auto& optimizer_config : cfg_.custom_optimizers()) {
    auto custom_optimizer = CustomGraphOptimizerRegistry::CreateByNameOrNull(
        optimizer_config.name());
    if (custom_optimizer) {
      VLOG(2) << "Registered custom configurable graph optimizer: "
              << optimizer_config.name();
      TF_RETURN_IF_ERROR(custom_optimizer->Init(&optimizer_config));
      optimizers->push_back(std::move(custom_optimizer));
    } else {
      VLOG(2) << "Can't register an optimizer by name: "
              << optimizer_config.name();
    }
  }
  return Status::OK();
}

Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
                                    GraphDef* optimized_graph) {
  std::vector<std::unique_ptr<GraphOptimizer>> optimizers;
  if (cfg_.optimizers().empty()) {
  if (cfg_.optimizers().empty() && cfg_.custom_optimizers().empty()) {
    TF_RETURN_IF_ERROR(InitializeOptimizers(&optimizers));
  } else {
    TF_RETURN_IF_ERROR(InitializeOptimizersByName(&optimizers));
@ -337,7 +350,7 @@ bool MetaOptimizerEnabled(const RewriterConfig& cfg) {
         cfg.auto_parallel().enable() ||
         cfg.memory_optimization() != RewriterConfig::NO_MEM_OPT ||
         cfg.debug_stripper() == RewriterConfig::ON ||
         !cfg.optimizers().empty();
         !cfg.optimizers().empty() || !cfg.custom_optimizers().empty();
}

Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg,

@ -134,6 +134,8 @@ Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index) {
  switch (element.dtype()) {
    TF_CALL_ALL_TYPES(HANDLE_TYPE);
    TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE);
    TF_CALL_uint32(HANDLE_TYPE);
    TF_CALL_uint64(HANDLE_TYPE);
#undef HANDLE_TYPE
    default:
      return errors::Unimplemented("CopyElementToSlice Unhandled data type: ",

@ -16,8 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"

namespace tensorflow {
REGISTER5(BinaryOp, CPU, "FloorDiv", functor::safe_floor_div, uint8, uint16,
          int16, int32, int64);
REGISTER6(BinaryOp, CPU, "FloorDiv", functor::safe_floor_div, uint8, uint16,
          int8, int16, int32, int64);
REGISTER3(BinaryOp, CPU, "FloorDiv", functor::floor_div_real, float,
          Eigen::half, double);

@ -20,6 +20,7 @@ limitations under the License.
#include <map>
#include <string>
#include <vector>
#include <memory>

#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
@ -42,14 +43,13 @@ limitations under the License.
#include "tensorflow/core/util/mkl_util.h"

#ifndef INTEL_MKL_ML

#include "mkldnn.hpp"

using mkldnn::prop_kind;
using mkldnn::stream;

using mkldnn::convolution_direct;
using mkldnn::convolution_forward;
using mkldnn::convolution_direct;

#else
#include "mkl_dnn.h"
#include "mkl_dnn_types.h"
@ -57,11 +57,232 @@ using mkldnn::convolution_forward;

namespace tensorflow {

#ifndef INTEL_MKL_ML

struct ConvFwdDimensions {
  memory::dims src_dims;
  memory::dims filter_dims;
  memory::dims bias_dims;
  memory::dims dst_dims;
  memory::dims strides;
  memory::dims dilations;
  memory::dims padding_left;
  memory::dims padding_right;

  ConvFwdDimensions(memory::dims src_dims,
                    memory::dims filter_dims, memory::dims bias_dims,
                    memory::dims dst_dims, memory::dims strides,
                    memory::dims dilations, memory::dims padding_left,
                    memory::dims padding_right) :
      src_dims(src_dims), filter_dims(filter_dims),
      bias_dims(bias_dims), dst_dims(dst_dims),
      strides(strides), dilations(dilations),
      padding_left(padding_left), padding_right(padding_right) {
  }
};

template <typename T>
class Conv2DFwd : public DnnOp {
 public:
  explicit Conv2DFwd(const ConvFwdDimensions& convFwdDims) {
    fwd_stream_.reset(new stream(stream::kind::eager));
    // create conv primitive
    if (conv_fwd_ == nullptr) {
      Setup(convFwdDims);
    }
  }

  ~Conv2DFwd() {}

  // Convolution forward execute with bias
  //   src_data: input data buffer of src
  //   filter_data: input data buffer of filter (weights)
  //   bias_data: input data buffer of bias
  //   dst_data: output data buffer of dst
  void Execute(T* src_data, T* filter_data, T* bias_data, T* dst_data) {
    src_mem_->set_data_handle(static_cast<void*>(src_data));
    filter_mem_->set_data_handle(static_cast<void*>(filter_data));
    bias_mem_->set_data_handle(static_cast<void*>(bias_data));
    dst_mem_->set_data_handle(static_cast<void*>(dst_data));
    fwd_stream_->submit(fwd_primitives_);

    // after exec, set data handle back
    src_mem_->set_data_handle(DummyData);
    filter_mem_->set_data_handle(DummyData);
    bias_mem_->set_data_handle(DummyData);
    dst_mem_->set_data_handle(DummyData);

    return;
  }

  // Convolution forward execute without bias
  //   src_data: input data buffer of src
  //   filter_data: input data buffer of filter (weights)
  //   dst_data: output data buffer of dst
  void Execute(T* src_data, T* filter_data, T* dst_data) {
    src_mem_->set_data_handle(static_cast<void*>(src_data));
    filter_mem_->set_data_handle(static_cast<void*>(filter_data));
    dst_mem_->set_data_handle(static_cast<void*>(dst_data));
    fwd_stream_->submit(fwd_primitives_);

    // after exec, set data handle back
    src_mem_->set_data_handle(DummyData);
    filter_mem_->set_data_handle(DummyData);
    dst_mem_->set_data_handle(DummyData);

    return;
  }

  // expected memory format for this primitive instance
  memory::format src_fmt_;
  memory::format filter_fmt_;

  // convolution primitive
  std::shared_ptr<mkldnn::convolution_forward::primitive_desc> fwd_pd_;
  std::shared_ptr<mkldnn::primitive> conv_fwd_;

 private:
  void Setup(const ConvFwdDimensions& convFwdDims) {
    // create memory descriptors for convolution data w/ no specified format
    src_md_.reset(new memory::desc({convFwdDims.src_dims},
                                   MklDnnType<T>(), memory::format::any));

    filter_md_.reset(new memory::desc({convFwdDims.filter_dims},
                                      MklDnnType<T>(), memory::format::any));

    dst_md_.reset(new memory::desc({convFwdDims.dst_dims},
                                   MklDnnType<T>(), memory::format::any));

    if (!convFwdDims.bias_dims.empty())
      bias_md_.reset(new memory::desc({convFwdDims.bias_dims},
                                      MklDnnType<T>(), memory::format::any));

    // create a convolution
    if (!convFwdDims.bias_dims.empty()) {
      fwd_desc_.reset(new convolution_forward::desc(prop_kind::forward,
          convolution_direct, *src_md_, *filter_md_, *bias_md_, *dst_md_,
          convFwdDims.strides, convFwdDims.dilations, convFwdDims.padding_left,
          convFwdDims.padding_right, padding_kind::zero));
    } else {
      fwd_desc_.reset(new convolution_forward::desc(prop_kind::forward,
          convolution_direct, *src_md_, *filter_md_, *dst_md_,
          convFwdDims.strides, convFwdDims.dilations, convFwdDims.padding_left,
          convFwdDims.padding_right, padding_kind::zero));
    }

    fwd_pd_.reset(new convolution_forward::primitive_desc(
        *fwd_desc_, cpu_engine_));

    // store the expected memory format
    src_fmt_ = static_cast<mkldnn::memory::format>(
        fwd_pd_.get()->src_primitive_desc().desc().data.format);

    filter_fmt_ = static_cast<mkldnn::memory::format>(
        fwd_pd_.get()->weights_primitive_desc().desc().data.format);

    // create memory primitive based on dummy data
    src_mem_.reset(new memory(fwd_pd_.get()->src_primitive_desc(), DummyData));
    filter_mem_.reset(new memory(fwd_pd_.get()->weights_primitive_desc(),
                                 DummyData));
    dst_mem_.reset(new memory(fwd_pd_.get()->dst_primitive_desc(), DummyData));

    // create convolution primitive and add it to net
    if (!convFwdDims.bias_dims.empty()) {
      bias_mem_.reset(new memory({{{convFwdDims.bias_dims}, MklDnnType<T>(),
                                   memory::format::x}, cpu_engine_}, DummyData));
      conv_fwd_.reset(new convolution_forward(*fwd_pd_, *src_mem_,
                                              *filter_mem_, *bias_mem_, *dst_mem_));
    } else {
      conv_fwd_.reset(new convolution_forward(*fwd_pd_, *src_mem_,
                                              *filter_mem_, *dst_mem_));
    }

    fwd_primitives_.push_back(*conv_fwd_);
    return;
  }

  // MKLDNN memory
  std::shared_ptr<mkldnn::memory> src_mem_;
  std::shared_ptr<mkldnn::memory> filter_mem_;
  std::shared_ptr<mkldnn::memory> bias_mem_;
  std::shared_ptr<mkldnn::memory> dst_mem_;

  std::shared_ptr<mkldnn::stream> fwd_stream_;
  std::vector<mkldnn::primitive> fwd_primitives_;

  // desc & primitive desc
  std::shared_ptr<mkldnn::convolution_forward::desc> fwd_desc_;

  // memory desc
  std::shared_ptr<mkldnn::memory::desc> src_md_;
  std::shared_ptr<mkldnn::memory::desc> filter_md_;
  std::shared_ptr<mkldnn::memory::desc> bias_md_;
  std::shared_ptr<mkldnn::memory::desc> dst_md_;

  engine cpu_engine_ = engine(engine::cpu, 0);
};

template <typename T>
class Conv2DFwdFactory : public DnnOpFactory<T> {
 public:
  static Conv2DFwd<T>* Get(const ConvFwdDimensions& convFwdDims) {
    Conv2DFwd<T>* conv2d_fwd = nullptr;

    // try to find a suitable one in pool
    conv2d_fwd = dynamic_cast<Conv2DFwd<T>*>(
        Conv2DFwdFactory<T>::GetInstance().GetConv2DFwd(convFwdDims));

    if (conv2d_fwd == nullptr) {
      conv2d_fwd = new Conv2DFwd<T>(convFwdDims);
      Conv2DFwdFactory<T>::GetInstance().SetConv2DFwd(
          convFwdDims, conv2d_fwd);
    }
    return conv2d_fwd;
  }

 private:
  Conv2DFwdFactory() {}
  ~Conv2DFwdFactory() {}

  static const int kDilationH = 0, kDilationW = 1;

  static Conv2DFwdFactory& GetInstance() {
    static Conv2DFwdFactory instance_;
    return instance_;
  }

  static std::string CreateKey(const ConvFwdDimensions& convFwdDims) {
    std::string prefix = "conv2d_fwd_";
    FactoryKeyCreator key_creator;
    key_creator.AddAsKey(prefix);
    key_creator.AddAsKey(convFwdDims.src_dims);
    key_creator.AddAsKey(convFwdDims.filter_dims);
    key_creator.AddAsKey(convFwdDims.bias_dims);
    key_creator.AddAsKey(convFwdDims.dst_dims);
    key_creator.AddAsKey(convFwdDims.strides);
    key_creator.AddAsKey(convFwdDims.dilations);
    key_creator.AddAsKey(convFwdDims.padding_left);
    key_creator.AddAsKey(convFwdDims.padding_right);
    return key_creator.GetKey();
  }

  DnnOp* GetConv2DFwd(const ConvFwdDimensions& convFwdDims) {
    std::string key = CreateKey(convFwdDims);
    return this->GetOp(key);
  }

  void SetConv2DFwd(const ConvFwdDimensions& convFwdDims, DnnOp* op) {
    std::string key = CreateKey(convFwdDims);
    this->SetOp(key, op);
  }
};
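The factory above follows a common memoization pattern: a singleton keeps a map from a shape/stride key to an already-built primitive, and `Get()` only constructs on a cache miss. A stripped-down standalone sketch of that pattern (hypothetical `Expensive` type and key strings; not the MKL code):

```c++
#include <cstdio>
#include <memory>
#include <string>
#include <unordered_map>

struct Expensive {
  explicit Expensive(const std::string& key) {
    std::printf("built %s\n", key.c_str());  // construction is the costly step
  }
};

class ExpensiveFactory {
 public:
  static Expensive* Get(const std::string& key) {
    auto& pool = Instance().pool_;
    auto it = pool.find(key);
    if (it == pool.end()) {
      // Cache miss: build once and keep it for later calls with the same key.
      it = pool.emplace(key, std::make_unique<Expensive>(key)).first;
    }
    return it->second.get();
  }

 private:
  static ExpensiveFactory& Instance() {
    static ExpensiveFactory instance;  // process-wide singleton
    return instance;
  }
  std::unordered_map<std::string, std::unique_ptr<Expensive>> pool_;
};

int main() {
  ExpensiveFactory::Get("conv2d_fwd_1x3x224x224");  // prints "built ..."
  ExpensiveFactory::Get("conv2d_fwd_1x3x224x224");  // reused; no second print
  return 0;
}
```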

#endif

typedef Eigen::ThreadPoolDevice CPUDevice;

// MKL-DNN is now default. MKL-ML must be specified explicitly.
// For now, MKL-ML is default. So making MKL-DNN not a default choice.
#ifdef INTEL_MKL_ML

template <typename Device, typename T, bool biasEnabled>
class MklConv2DOp : public OpKernel {
 public:
@ -528,8 +749,6 @@ class MklConv2DOp : public OpKernel {

  void Compute(OpKernelContext* context) override {
    try {
      auto cpu_engine = engine(engine::cpu, 0);

      // Input tensors
      const Tensor& src_tensor = MklGetInput(context, kInputIndex_Src);
      const Tensor& filter_tensor = MklGetInput(context, kInputIndex_Filter);
@ -538,16 +757,16 @@ class MklConv2DOp : public OpKernel {
      GetMklShape(context, kInputIndex_Src, &src_mkl_shape);
      GetMklShape(context, kInputIndex_Filter, &filter_mkl_shape);
      OP_REQUIRES(context, filter_mkl_shape.IsMklTensor() == false,
                  errors::InvalidArgument("Filter should not be in "
                                          "Mkl Layout"));
                  errors::InvalidArgument("Filter should not be in "
                                          "Mkl Layout"));

      MklDnnData<T> src(&cpu_engine);
      MklDnnData<T> filter(&cpu_engine);
      MklDnnData<T> output(&cpu_engine);
      MklDnnData<T> dst(&cpu_engine);  // output

      memory::dims src_dims, filter_dims, padding_l, padding_r,
      memory::dims src_dims, filter_dims, padding_left, padding_right,
          dilations, strides;
      memory::dims output_dims_tf_order, output_dims_mkl_order;
      memory::dims dst_dims_tf_order, dst_dims_mkl_order;

      // Get shapes of input tensors in MKL-DNN order
      MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_,
@ -555,31 +774,29 @@ class MklConv2DOp : public OpKernel {
      auto src_tf_shape = GetTfShape(context, kInputIndex_Src);
      auto filter_tf_shape = GetTfShape(context, kInputIndex_Filter);
      conv_utl.GetConvFwdSizesInMklOrder(
          src_tf_shape, filter_tf_shape, &src_dims, &filter_dims, &strides,
          &dilations, &output_dims_tf_order, &output_dims_mkl_order,
          &padding_l, &padding_r);
          src_tf_shape, filter_tf_shape, &src_dims, &filter_dims,
          &strides, &dilations, &dst_dims_tf_order, &dst_dims_mkl_order,
          &padding_left, &padding_right);
      if (!context->status().ok()) return;

      // Check for corner case - if there is nothing to compute, return.
      TensorShape output_tf_shape = MklDnnDimsToTFShape(output_dims_tf_order);
      TensorShape dst_tf_shape = MklDnnDimsToTFShape(dst_dims_tf_order);

      // Corner cases: output with 0 elements and 0 batch size.
      Tensor* output_tensor = nullptr;
      if (output_tf_shape.num_elements() == 0 || output_dims_tf_order[0] == 0) {
        // TODO(jbobba): Verify correctness here
        // Need semantics for Null MKL tensor
        MklDnnShape output_mkl_shape;
        output_mkl_shape.SetMklTensor(false);

        AllocateOutputSetMklShape(context, kOutputIndex_Dst, &output_tensor,
                                  src_tf_shape, output_mkl_shape);
      Tensor* dst_tensor = nullptr;
      if (dst_tf_shape.num_elements() == 0 ||
          dst_dims_tf_order[0] == 0) {
        MklDnnShape dst_mkl_shape;
        dst_mkl_shape.SetMklTensor(false);
        AllocateOutputSetMklShape(context, kOutputIndex_Dst,
                                  &dst_tensor, src_tf_shape, dst_mkl_shape);

        // MklConv2D also outputs converted filter as 2nd output of Conv2D.
        filter_mkl_shape.SetMklTensor(false);
        Tensor* output_filter_tensor = nullptr;
        AllocateOutputSetMklShape(context, kOutputIndex_Filter,
                                  &output_filter_tensor, filter_tf_shape,
                                  filter_mkl_shape);
                                  &output_filter_tensor,
                                  filter_tf_shape, filter_mkl_shape);
        return;
      }

@ -587,6 +804,7 @@ class MklConv2DOp : public OpKernel {
      // Describe how the inputs and outputs of Convolution look like. Also
      // specify buffers containing actual input and output data.
      auto tf_fmt = TFDataFormatToMklDnnDataFormat(data_format_);

      // If input is in MKL layout, then simply grab input layout; otherwise,
      // construct input Tf layout. For TF layout, although input shape
      // (src_dims) required is in MKL-DNN order, the layout is Tensorflow's
@ -595,6 +813,7 @@ class MklConv2DOp : public OpKernel {
                        ? src_mkl_shape.GetMklLayout()
                        : memory::desc(src_dims, MklDnnType<T>(), tf_fmt);
      src.SetUsrMem(src_md, &src_tensor);

      // Although filter shape (filter_dims) required is in MKL-DNN order,
      // the layout is Tensorflow's layout (HWIO).
      auto filter_md = filter_mkl_shape.IsMklTensor()  // Should NEVER be true
@ -603,98 +822,70 @@ class MklConv2DOp : public OpKernel {
                           memory::format::hwio);
      filter.SetUsrMem(filter_md, &filter_tensor);

      // Set output shape (output_dims) required in MKL-DNN order.
      // Currently, we set output layout as Tensorflow's layout (NHWC or NCHW
      // depending on data format). But later we propagate Mkl layout of the
      // output to the next op directly.
      output.SetUsrMem(output_dims_mkl_order, tf_fmt);

      // Create memory descriptors for convolution data w/ no specified format.
      src.SetOpMemDesc(src_dims, memory::format::any);
      filter.SetOpMemDesc(filter_dims, memory::format::any);
      output.SetOpMemDesc(output_dims_mkl_order, memory::format::any);

      // MKLDNN dilation starts from 0.
      dilations[kDilationH] -= 1;
      dilations[kDilationW] -= 1;

      // get a conv2d fwd from primitive pool
      Conv2DFwd<T>* conv2d_fwd = nullptr;
      if (biasEnabled) {
        // Create convolution primitive with Bias.
        MklDnnData<T> bias(&cpu_engine);
        memory::dims bias_size;
        conv_utl.GetBiasSizeInMklOrder(kInputIndex_Bias, &bias_size);
        const Tensor& bias_tensor = MklGetInput(context, kInputIndex_Bias);
        bias.SetUsrMem(bias_size, memory::format::x, &bias_tensor);
        bias.SetOpMemDesc(bias_size, memory::format::any);

        // Create convolution primitive with Bias.
        // Use MKLDNN dilated convolution in case of dilated rate (>0).
        auto conv_desc = (dilations[kDilationH] > 0 ||
                          dilations[kDilationW] > 0) ?
            convolution_forward::desc(prop_kind::forward,
                convolution_direct, src.GetOpMemDesc(),
                filter.GetOpMemDesc(), bias.GetOpMemDesc(),
                output.GetOpMemDesc(), strides, dilations,
                padding_l, padding_r,
                TFPaddingToMklDnnPadding(padding_)):
            convolution_forward::desc(prop_kind::forward,
                convolution_direct, src.GetOpMemDesc(),
                filter.GetOpMemDesc(), bias.GetOpMemDesc(),
                output.GetOpMemDesc(), strides,
                padding_l, padding_r,
                TFPaddingToMklDnnPadding(padding_));

        auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc,
                                                                  cpu_engine);
        AllocateOutputTensor(context, conv_prim_desc,
                             output_dims_mkl_order, tf_fmt, &output_tensor);
        // Set data handle for output.
        output.SetUsrMemDataHandle(output_tensor);

        Tensor* filter_out_tensor = nullptr;
        AllocateFilterOutputTensor(context, conv_prim_desc,
                                   TFShapeToMklDnnDims(filter_tf_shape),
                                   &filter_out_tensor);

        PrepareAndExecuteNet(conv_prim_desc, &src, &filter, &bias, &output,
                             filter_out_tensor);
        memory::dims bias_dims = {};
        conv_utl.GetBiasSizeInMklOrder(kInputIndex_Bias, &bias_dims);
        ConvFwdDimensions convFwdDims(src_dims, filter_dims, bias_dims,
            dst_dims_mkl_order, strides, dilations, padding_left, padding_right);
        conv2d_fwd = Conv2DFwdFactory<T>::Get(convFwdDims);
      } else {
        // Create convolution primitive without Bias.
        // Use MKLDNN dilated convolution in case of dilated rate (>0).
        auto conv_desc = (dilations[kDilationH] > 0 ||
                          dilations[kDilationW] > 0) ?
            convolution_forward::desc(prop_kind::forward,
                convolution_direct, src.GetOpMemDesc(),
                filter.GetOpMemDesc(), output.GetOpMemDesc(),
                strides, dilations, padding_l, padding_r,
                TFPaddingToMklDnnPadding(padding_)):
            convolution_forward::desc(prop_kind::forward,
                convolution_direct, src.GetOpMemDesc(),
                filter.GetOpMemDesc(), output.GetOpMemDesc(),
                strides, padding_l, padding_r,
                TFPaddingToMklDnnPadding(padding_));

        auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc,
                                                                  cpu_engine);
        AllocateOutputTensor(context, conv_prim_desc, output_dims_mkl_order,
                             tf_fmt, &output_tensor);
        // Set data handle for output.
        output.SetUsrMemDataHandle(output_tensor);

        Tensor* filter_out_tensor = nullptr;
        AllocateFilterOutputTensor(context, conv_prim_desc,
                                   TFShapeToMklDnnDims(filter_tf_shape),
                                   &filter_out_tensor);
        PrepareAndExecuteNet(conv_prim_desc, &src, &filter,
                             nullptr, &output, filter_out_tensor);
        ConvFwdDimensions convFwdDims(src_dims, filter_dims, NONE_DIMS,
            dst_dims_mkl_order, strides, dilations, padding_left, padding_right);
        conv2d_fwd = Conv2DFwdFactory<T>::Get(convFwdDims);
      }
    } catch (mkldnn::error& e) {

      // allocate output tensors output_tensor and filter_out_tensor
      std::shared_ptr<mkldnn::convolution_forward::primitive_desc>
          conv_fwd_pd = conv2d_fwd->fwd_pd_;
      AllocateOutputTensor(context, *conv_fwd_pd,
                           dst_dims_mkl_order, tf_fmt, &dst_tensor);
      Tensor* filter_out_tensor = nullptr;
      AllocateFilterOutputTensor(context, *conv_fwd_pd,
                                 TFShapeToMklDnnDims(filter_tf_shape),
                                 &filter_out_tensor);

      T* dst_data = static_cast<T*>(dst_tensor->flat<T>().data());

      // check whether src/filter need reorder
      std::vector<primitive> net;
      if (src_md.data.format != conv2d_fwd->src_fmt_)
        src.CheckReorderToOpMem(
            conv_fwd_pd.get()->src_primitive_desc(), &net);

      if (filter_md.data.format != conv2d_fwd->filter_fmt_)
        filter.CheckReorderToOpMem(
            conv_fwd_pd.get()->weights_primitive_desc(),
            filter.GetTensorBuffer(filter_out_tensor), &net);
      stream(stream::kind::eager).submit(net).wait();

      T* src_data = static_cast<T*>(
          src.GetOpMem().get_data_handle());
      T* filter_data = static_cast<T*>(
          filter.GetOpMem().get_data_handle());

      // execute convolution
      if (biasEnabled) {
        const Tensor& bias_tensor = MklGetInput(context, kInputIndex_Bias);
        T* bias_data = static_cast<T*>(const_cast<T*>(
            bias_tensor.flat<T>().data()));

        conv2d_fwd->Execute(src_data, filter_data, bias_data, dst_data);
      } else {
        conv2d_fwd->Execute(src_data, filter_data, dst_data);
      }
    } catch (mkldnn::error &e) {
      string error_msg = "Status: " + std::to_string(e.status) +
                         ", message: " + std::string(e.message) + ", in file " +
                         std::string(__FILE__) + ":" + std::to_string(__LINE__);
      OP_REQUIRES_OK(
          context,
          errors::Aborted("Operation received an exception:", error_msg));
                         ", message: " + std::string(e.message) +
                         ", in file " + std::string(__FILE__) + ":" +
                         std::to_string(__LINE__);
      OP_REQUIRES_OK(context,
          errors::Aborted("Operation received an exception:", error_msg));
    }
  }

@ -706,6 +897,7 @@ class MklConv2DOp : public OpKernel {
  const int kInputIndex_Src = 0, kInputIndex_Filter = 1, kInputIndex_Bias = 2;
  const int kOutputIndex_Dst = 0, kOutputIndex_Filter = 1;
  const int kDilationH = 0, kDilationW = 1;
  engine cpu_engine = engine(engine::cpu, 0);

  // Allocate output tensor.
  void AllocateOutputTensor(

@ -241,6 +241,7 @@ class ScatterNdUpdateOp : public OpKernel {
TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_ADD_SUB_CPU);
TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_UPDATE_CPU);
TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_CPU);
TF_CALL_string(REGISTER_SCATTER_ND_CPU);

// Registers GPU kernels.
#if GOOGLE_CUDA

@ -160,6 +160,7 @@ struct ScatterNdFunctor<CPUDevice, T, Index, OP, IXDIM> {
  REGISTER_SCATTER_ND_INDEX(type, scatter_nd_op::UpdateOp::SUB);

TF_CALL_ALL_TYPES(REGISTER_SCATTER_ND_UPDATE);
REGISTER_SCATTER_ND_INDEX(string, scatter_nd_op::UpdateOp::ADD);
TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_MATH)

#undef REGISTER_SCATTER_ND_MATH

@ -16,35 +16,6 @@ limitations under the License.
#ifndef TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
#define TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_


// This file requires the following include because it uses CudaAtomicMax:
// #include "tensorflow/core/util/cuda_kernel_helper.h"

// Unfortunately we can't add the #include, since it breaks compilation for
// non-GPU targets. This only breaks in clang, because it's more strict for
// template code and CudaAtomicMax is used in template context.

// This file requires the following include because it uses CudaAtomicMax:
// #include "tensorflow/core/util/cuda_kernel_helper.h"

// Unfortunately we can't add the #include, since it breaks compilation for
// non-GPU targets. This only breaks in clang, because it's more strict for
// template code and CudaAtomicMax is used in template context.

// This file requires the following include because it uses CudaAtomicMax:
// #include "tensorflow/core/util/cuda_kernel_helper.h"

// Unfortunately we can't add the #include, since it breaks compilation for
// non-GPU targets. This only breaks in clang, because it's more strict for
// template code and CudaAtomicMax is used in template context.

// This file requires the following include because it uses CudaAtomicMax:
// #include "tensorflow/core/util/cuda_kernel_helper.h"

// Unfortunately we can't add the #include, since it breaks compilation for
// non-GPU targets. This only breaks in clang, because it's more strict for
// template code and CudaAtomicMax is used in template context.

// This file requires the following include because it uses CudaAtomicMax:
// #include "tensorflow/core/util/cuda_kernel_helper.h"

@ -23,7 +23,7 @@ limitations under the License.
#if defined(WIN32)
#include "extras/CUPTI/include/cupti.h"
#else
#include "cuda/extras/CUPTI/include/cupti.h"
#include "cupti.h"
#endif
namespace perftools {
namespace gputools {

@ -24,7 +24,7 @@ limitations under the License.

// TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
// "-beta", "-rc", "-rc.1")
#define TF_VERSION_SUFFIX "-rc0"
#define TF_VERSION_SUFFIX "-rc1"

#define TF_STR_HELPER(x) #x
#define TF_STR(x) TF_STR_HELPER(x)

@ -19,6 +19,8 @@ limitations under the License.

#include <string>
#include <vector>
#include <unordered_map>
#include <utility>

#include "mkl_dnn.h"
#include "mkl_dnn_types.h"
@ -1759,7 +1761,90 @@ class MklDnnData {
  }
};

#endif  // INTEL_MKL_ML
/// Base class for operations with reuse of DNN primitives
///
class DnnOp {
 public:
  virtual ~DnnOp() {}

  // Dummy data. Its size, hard-coded as 256 here, does
  // not matter since MKL should never operate on this buffer.
  unsigned char DummyData[256];
};

const mkldnn::memory::dims NONE_DIMS = {};
// This constant is used to declare dummy buffer (size), for MKL primitives
template <typename T>
class DnnOpFactory {
 public:
  DnnOpFactory() {}
  ~DnnOpFactory() {}

  DnnOp* GetOp(const std::string& key) {
    auto stream_iter = DnnOpFactory<T>::GetHashMap().find(key);
    if (stream_iter == DnnOpFactory<T>::GetHashMap().end()) {
      return nullptr;
    } else {
      return stream_iter->second;
    }
  }

  void SetOp(const std::string& key, DnnOp* op) {
    auto stream_iter = DnnOpFactory<T>::GetHashMap().find(key);

    CHECK(stream_iter == DnnOpFactory<T>::GetHashMap().end());

    DnnOpFactory<T>::GetHashMap()[key] = op;
  }

 private:
  static inline std::unordered_map<std::string, DnnOp*>& GetHashMap() {
    static thread_local std::unordered_map<std::string, DnnOp*> map_;
    return map_;
  }
};

// utility class for creating keys of MKL primitive pool.
class FactoryKeyCreator {
 public:
  FactoryKeyCreator() {
    key_.reserve(kMaxKeyLength);
  }

  ~FactoryKeyCreator() {}

  void AddAsKey(const string& str) {
    auto buffer = reinterpret_cast<const char*>(str.c_str());
    Append(buffer, str.length());
  }

  void AddAsKey(const mkldnn::memory::dims& dims) {
    for (unsigned int i = 0; i < dims.size(); i++) {
      AddAsKey<int>(dims[i]);
    }
  }

  template <typename T>
  void AddAsKey(const T data) {
    auto buffer = reinterpret_cast<const char*>(&data);
    Append(buffer, sizeof(T));
  }

  std::string GetKey() {
    return key_;
  }

 private:
  string key_;
  const char delimiter = 'x';
  const int kMaxKeyLength = 256;
  void Append(const char* data, int len) {
    key_.append(data, len);
    key_.append(1, delimiter);
  }
};
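The key creator above just serializes each field's raw bytes with a delimiter, so two primitives share a pool entry only when every dimension, stride, and padding matches. A standalone sketch of the same idea (hypothetical names, std types only):

```c++
#include <cstdio>
#include <string>
#include <vector>

// Appends the raw bytes of each int plus a delimiter, mirroring AddAsKey.
void AppendDims(const std::vector<int>& dims, std::string* key) {
  for (int d : dims) {
    key->append(reinterpret_cast<const char*>(&d), sizeof(d));
    key->append(1, 'x');  // delimiter between fields
  }
}

int main() {
  std::string key_a, key_b;
  AppendDims({1, 3, 224, 224}, &key_a);
  AppendDims({1, 3, 224, 224}, &key_b);
  std::printf("equal: %d\n", key_a == key_b);  // equal: 1 -> same pool entry
  return 0;
}
```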

#endif  // INTEL_MKL_DNN

}  // namespace tensorflow
#endif  // INTEL_MKL

@ -1,5 +1,5 @@
# Roadmap
**Last updated: Feb 15, 2018**
**Last updated: Apr 27, 2018**

TensorFlow is a rapidly moving, community supported project. This document is intended
to provide guidance about priorities and focus areas of the core set of TensorFlow
@ -14,12 +14,12 @@ expected in the next one to two releases.

### APIs
#### High Level APIs:
* Easy multi-GPU utilization with Estimators
* Easy multi-GPU and TPU utilization with Estimators
* Easy-to-use high-level pre-made estimators for Gradient Boosted Trees, Time Series, and other models

#### Eager Execution:
* Efficient utilization of multiple GPUs
* Distributed training (multi-machine)
* Distributed training support (multi-machine)
* Performance improvements
* Simpler export to a GraphDef/SavedModel

@ -31,14 +31,14 @@ to create Keras models Eager- style via Model subclassing)

#### Official Models:
* A set of
[reference models](https://github.com/tensorflow/models/tree/master/official)
[models](https://github.com/tensorflow/models/tree/master/official)
across image recognition, speech, object detection, and
translation that demonstrate best practices and serve as a starting point for
high-performance model development.

#### Contrib:
* Deprecation notices added to parts of tf.contrib where preferred implementations exist outside of tf.contrib.
* As much as possible, large projects inside tf.contrib moved to separate repositories.
* Deprecate parts of tf.contrib where preferred implementations exist outside of tf.contrib.
* As much as possible, move large projects inside tf.contrib to separate repositories.
* The tf.contrib module will eventually be discontinued in its current form; experimental development will in future happen in other repositories.

@ -50,36 +50,72 @@ across image recognition, speech, object detection, and

### Platforms
#### TensorFlow Lite:
* Increased coverage of supported ops in TensorFlow Lite
* Increase coverage of supported ops in TensorFlow Lite
* Easier conversion of a trained TensorFlow graph for use on TensorFlow Lite
* Support for GPU acceleration in TensorFlow Lite (iOS and Android)
* Support for hardware accelerators via Android NeuralNets API
* Improved CPU performance by quantization and other network optimizations (eg. pruning, distillation)
* Increased support for devices beyond Android and iOS (eg. RPi, Cortex-M)
* Improve CPU performance by quantization and other network optimizations (eg. pruning, distillation)
* Increase support for devices beyond Android and iOS (eg. RPi, Cortex-M)

#### TensorFlow.js:
* Release package for Node.js bindings to the TensorFlow C API through the TensorFlow.js backend interface
* Expand support for importing TensorFlow SavedModels and Keras models into browser with unified APIs supporting retraining in browser
* Improve Layers API and allow model exporting/saving
* Release tfjs-data API for efficient data input pipelines

#### TensorFlow with Swift:
* Establish open source project including documentation, open design, and code availability.
* Continue implementing and refining implementation and design through 2018.
* Aim for implementation to be solid enough for general use later in 2018.

### Performance
#### Distributed TensorFlow:
* Multi-GPU support optimized for a variety of GPU topologies
* Improved mechanisms for distributing computations on several machines
* Optimize Multi-GPU support for a variety of GPU topologies
* Improve mechanisms for distributing computations on several machines

#### Optimizations:
* Mixed precision training support with initial example model and guide
* Native TensorRT support
#### GPU Optimizations:
* Simplify mixed precision API with initial example model and guide.
* Finalize TensorRT API and move to core.
* CUDA 9.2 and NCCL 2.x default in TensorFlow builds.
* Optimizations for DGX-2.
* Remove support for CUDA less than 8.x and cuDNN less than 6.x.


#### CPU Optimizations
* Int8 support for SkyLake via MKL
* Dynamic loading of SIMD-optimized kernels
* MKL for Linux and Windows

### End-to-end ML systems:
#### TensorFlow Hub:
* Expand support for module-types in TF Hub with TF Eager integration, Keras layers integration, and TensorFlow.js integration
* Accept variable-sized image input
* Improve multi-GPU estimator support
* Document and improve TPU integration

#### TensorFlow Extended:
* Open source more of the TensorFlow Extended platform to facilitate adoption of TensorFlow in production settings.
* Release TFX libraries for Data Validation

### Documentation and Resources:
* Update documentation, tutorials and Getting Started guides on all features and APIs
* Update [Youtube Tensorflow channel](https://youtube.com/tensorflow) weekly with new content:
  Coding TensorFlow - where we teach folks coding with tensorflow
  TensorFlow Meets - where we highlight community contributions
  Ask TensorFlow - where we answer community questions
  Guest and Showcase videos
* Update [Official TensorFlow blog](https://blog.tensorflow.org) with regular articles from Google team and the Community

### Documentation and Usability:
* Updated documentation, tutorials and Getting Started guides
* Process to enable external contributions to tutorials, documentation, and blogs showcasing best practice use-cases of TensorFlow and high-impact applications

### Community and Partner Engagement
#### Special Interest Groups:
* Mobilizing the community to work together in focused domains
* Mobilize the community to work together in focused domains
* [tf-distribute](https://groups.google.com/a/tensorflow.org/forum/#!forum/tf-distribute): build and packaging of TensorFlow
* More to be identified and launched
* SIG TensorBoard, SIG Rust, and more to be identified and launched

#### Community:
* Incorporate public feedback on significant design decisions via a Request-for-Comment (RFC) process
* Formalize process for external contributions to land in TensorFlow and associated projects
* Grow global TensorFlow communities and user groups
* Collaborate with partners to co-develop and publish research papers
* Process to enable external contributions to tutorials, documentation, and blogs showcasing best practice use-cases of TensorFlow and high-impact applications

@ -38,8 +38,10 @@ Estimators automatically write the following to disk:
uses to create visualizations.

To specify the top-level directory in which the Estimator stores its
information, assign a value to the optional `model_dir` argument of any
Estimator's constructor. For example, the following code sets the `model_dir`
information, assign a value to the optional `model_dir` argument of *any*
`Estimator`'s constructor.
Taking `DNNClassifier` as an example,
the following code sets the `model_dir`
argument to the `models/iris` directory:

```python

@ -138,7 +138,7 @@ The model will represent the buckets as follows:
|< 1960              | [1, 0, 0, 0] |
|>= 1960 but < 1980  | [0, 1, 0, 0] |
|>= 1980 but < 2000  | [0, 0, 1, 0] |
|> 2000              | [0, 0, 0, 1] |
|>= 2000             | [0, 0, 0, 1] |
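A minimal sketch of how a year maps to the one-hot bucket vector in the corrected table (plain C++ with hypothetical names; not the feature-column implementation):

```c++
#include <array>
#include <cstdio>

// Boundaries 1960, 1980, 2000 give four buckets: <1960, [1960, 1980),
// [1980, 2000), and >=2000, matching the table above.
std::array<int, 4> BucketizeYear(int year) {
  std::array<int, 4> one_hot = {0, 0, 0, 0};
  if (year < 1960) one_hot[0] = 1;
  else if (year < 1980) one_hot[1] = 1;
  else if (year < 2000) one_hot[2] = 1;
  else one_hot[3] = 1;
  return one_hot;
}

int main() {
  auto v = BucketizeYear(1985);
  std::printf("[%d, %d, %d, %d]\n", v[0], v[1], v[2], v[3]);  // [0, 0, 1, 0]
  return 0;
}
```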
|
||||
|
||||
Why would you want to split a number—a perfectly valid input to your
|
||||
model—into a categorical value? Well, notice that the categorization splits a
|
||||
|
@ -10,7 +10,7 @@ course prior to diving into TensorFlow documentation:
|
||||
TensorFlow is a tool for machine learning. While it contains a wide range of
|
||||
functionality, TensorFlow is mainly designed for deep neural network models.
|
||||
|
||||
The easiest way to get started with tensorflow is using Eager Execution.
|
||||
The easiest way to get started with TensorFlow is using Eager Execution.
|
||||
|
||||
* @{$get_started/eager}, is for anyone new to machine learning or TensorFlow.
|
||||
|
||||
|
@ -38,7 +38,7 @@ enable TensorFlow for C:
|
||||
OS="linux" # Change to "darwin" for macOS
|
||||
TARGET_DIRECTORY="/usr/local"
|
||||
curl -L \
|
||||
"https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.8.0-rc0.tar.gz" |
|
||||
"https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.8.0-rc1.tar.gz" |
|
||||
sudo tar -C $TARGET_DIRECTORY -xz
|
||||
|
||||
The `tar` command extracts the TensorFlow C library into the `lib`
|
||||
|
@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
|
||||
TF_TYPE="cpu" # Change to "gpu" for GPU support
|
||||
TARGET_DIRECTORY='/usr/local'
|
||||
curl -L \
|
||||
"https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.8.0-rc0.tar.gz" |
|
||||
"https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.8.0-rc1.tar.gz" |
|
||||
sudo tar -C $TARGET_DIRECTORY -xz
|
||||
|
||||
The `tar` command extracts the TensorFlow C library into the `lib`
|
||||
|
@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
|
||||
<dependency>
|
||||
<groupId>org.tensorflow</groupId>
|
||||
<artifactId>tensorflow</artifactId>
|
||||
<version>1.8.0-rc0</version>
|
||||
<version>1.8.0-rc1</version>
|
||||
</dependency>
|
||||
```
|
||||
|
||||
@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
|
||||
<dependency>
|
||||
<groupId>org.tensorflow</groupId>
|
||||
<artifactId>tensorflow</artifactId>
|
||||
<version>1.8.0-rc0</version>
|
||||
<version>1.8.0-rc1</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
||||
@ -124,12 +124,12 @@ instead:
|
||||
<dependency>
|
||||
<groupId>org.tensorflow</groupId>
|
||||
<artifactId>libtensorflow</artifactId>
|
||||
<version>1.8.0-rc0</version>
|
||||
<version>1.8.0-rc1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.tensorflow</groupId>
|
||||
<artifactId>libtensorflow_jni_gpu</artifactId>
|
||||
<version>1.8.0-rc0</version>
|
||||
<version>1.8.0-rc1</version>
|
||||
</dependency>
|
||||
```
|
||||
|
||||
@ -148,7 +148,7 @@ refer to the simpler instructions above instead.
|
||||
Take the following steps to install TensorFlow for Java on Linux or macOS:
|
||||
|
||||
1. Download
|
||||
[libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0-rc0.jar),
|
||||
[libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0-rc1.jar),
|
||||
which is the TensorFlow Java Archive (JAR).
|
||||
|
||||
2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
|
||||
@ -167,7 +167,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
|
||||
OS=$(uname -s | tr '[:upper:]' '[:lower:]')
|
||||
mkdir -p ./jni
|
||||
curl -L \
|
||||
"https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.8.0-rc0.tar.gz" |
|
||||
"https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.8.0-rc1.tar.gz" |
|
||||
tar -xz -C ./jni
|
||||
|
||||
### Install on Windows
|
||||
@ -175,10 +175,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
|
||||
Take the following steps to install TensorFlow for Java on Windows:
|
||||
|
||||
1. Download
|
||||
[libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0-rc0.jar),
|
||||
[libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0-rc1.jar),
|
||||
which is the TensorFlow Java Archive (JAR).
|
||||
2. Download the following Java Native Interface (JNI) file appropriate for
|
||||
[TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.8.0-rc0.zip).
|
||||
[TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.8.0-rc1.zip).
|
||||
3. Extract this .zip file.
|
||||
|
||||
|
||||
@ -227,7 +227,7 @@ must be part of your `classpath`. For example, you can include the
|
||||
downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
|
||||
as follows:
|
||||
|
||||
<pre><b>javac -cp libtensorflow-1.8.0-rc0.jar HelloTF.java</b></pre>
|
||||
<pre><b>javac -cp libtensorflow-1.8.0-rc1.jar HelloTF.java</b></pre>
|
||||
|
||||
|
||||
### Running
|
||||
@ -241,11 +241,11 @@ two files are available to the JVM:
|
||||
For example, the following command line executes the `HelloTF` program on Linux
|
||||
and macOS X:
|
||||
|
||||
<pre><b>java -cp libtensorflow-1.8.0-rc0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
|
||||
<pre><b>java -cp libtensorflow-1.8.0-rc1.jar:. -Djava.library.path=./jni HelloTF</b></pre>
|
||||
|
||||
And the following command line executes the `HelloTF` program on Windows:
|
||||
|
||||
<pre><b>java -cp libtensorflow-1.8.0-rc0.jar;. -Djava.library.path=jni HelloTF</b></pre>
|
||||
<pre><b>java -cp libtensorflow-1.8.0-rc1.jar;. -Djava.library.path=jni HelloTF</b></pre>
|
||||
|
||||
If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
|
||||
installed TensorFlow for Java and are ready to use the API. If the program
|
||||
|
@ -1,139 +1,266 @@
# Installing TensorFlow on Ubuntu

This guide explains how to install TensorFlow on Ubuntu. Although these
instructions might also work on other Linux variants, we have only
tested (and we only support) these instructions on machines meeting the
following requirements:
This guide explains how to install TensorFlow on Ubuntu Linux. While these
instructions may work on other Linux variants, they are tested and supported with
the following system requirements:

* 64-bit desktops or laptops
* Ubuntu 16.04 or higher
* 64-bit desktops or laptops
* Ubuntu 16.04 or higher


## Determine which TensorFlow to install
## Choose which TensorFlow to install

You must choose one of the following types of TensorFlow to install:
The following TensorFlow variants are available for installation:

* **TensorFlow with CPU support only**. If your system does not have a
  NVIDIA® GPU, you must install this version. Note that this version of
  TensorFlow is typically much easier to install (typically,
  in 5 or 10 minutes), so even if you have an NVIDIA GPU, we recommend
  installing this version first.
* **TensorFlow with GPU support**. TensorFlow programs typically run
  significantly faster on a GPU than on a CPU. Therefore, if your
  system has a NVIDIA® GPU meeting the prerequisites shown below and you
  need to run performance-critical applications, you should ultimately
  install this version.

<a name="NVIDIARequirements"></a>
### NVIDIA requirements to run TensorFlow with GPU support

If you are installing TensorFlow with GPU support using one of the
mechanisms described in this guide, then the following NVIDIA software
must be installed on your system:

* [CUDA Toolkit 9.0](http://nvidia.com/cuda). For details, see
  [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/).
  Ensure that you append the relevant CUDA pathnames to the
  `LD_LIBRARY_PATH` environment variable as described in the
  NVIDIA documentation.
* [cuDNN SDK v7](http://developer.nvidia.com/cudnn). For details, see
  [NVIDIA's documentation](http://docs.nvidia.com/deeplearning/sdk/cudnn-install/).
  Ensure that you create the `CUDA_HOME` environment variable as
  described in the NVIDIA documentation.
* GPU card with CUDA Compute Capability 3.0 or higher for building
  from source and 3.5 or higher for our binaries. See
  [NVIDIA documentation](https://developer.nvidia.com/cuda-gpus) for
  a list of supported GPU cards.
* [GPU drivers](http://nvidia.com/driver) supporting your version of the CUDA
  Toolkit.
* The libcupti-dev library, which is the NVIDIA CUDA Profile Tools Interface.
  This library provides advanced profiling support. To install this library,
  issue the following command for CUDA Toolkit >= 8.0:

  <pre>
  $ <b>sudo apt-get install cuda-command-line-tools</b>
  </pre>

  and add its path to your `LD_LIBRARY_PATH` environment variable:

  <pre>
  $ <b>export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/usr/local/cuda/extras/CUPTI/lib64</b>
  </pre>

  For CUDA Toolkit <= 7.5 do:

  <pre>
  $ <b>sudo apt-get install libcupti-dev</b>
  </pre>

* **[OPTIONAL]** For optimized inferencing performance, you can also install
  **NVIDIA TensorRT 3.0**. The minimal set of TensorRT runtime components needed
  for use with the pre-built `tensorflow-gpu` package can be installed as follows:

  <pre>
  $ <b>wget https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1404/x86_64/nvinfer-runtime-trt-repo-ubuntu1404-3.0.4-ga-cuda9.0_1.0-1_amd64.deb</b>
  $ <b>sudo dpkg -i nvinfer-runtime-trt-repo-ubuntu1404-3.0.4-ga-cuda9.0_1.0-1_amd64.deb</b>
  $ <b>sudo apt-get update</b>
  $ <b>sudo apt-get install -y --allow-downgrades libnvinfer-dev libcudnn7-dev=7.0.5.15-1+cuda9.0 libcudnn7=7.0.5.15-1+cuda9.0</b>
  </pre>

  **IMPORTANT:** For compatibility with the pre-built `tensorflow-gpu`
  package, please use the Ubuntu **14.04** package of TensorRT as shown above,
  even when installing onto an Ubuntu 16.04 system.<br/>
  <br/>
  To build the TensorFlow-TensorRT integration module from source rather than
  using pre-built binaries, see the [module documentation](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/tensorrt#using-tensorrt-in-tensorflow).
  For detailed TensorRT installation instructions, see [NVIDIA's TensorRT documentation](http://docs.nvidia.com/deeplearning/sdk/tensorrt-install-guide/index.html).<br/>
  <br/>
  To avoid cuDNN version conflicts during later system upgrades, you can hold
  the cuDNN version at 7.0.5:

  <pre>
  $ <b>sudo apt-mark hold libcudnn7 libcudnn7-dev</b>
  </pre>

  To later allow upgrades, you can remove the hold:

  <pre>
  $ <b>sudo apt-mark unhold libcudnn7 libcudnn7-dev</b>
  </pre>

If you have an earlier version of the preceding packages, please upgrade to
the specified versions. If upgrading is not possible, then you may still run
TensorFlow with GPU support, if you @{$install_sources$install TensorFlow from Sources}.
* __TensorFlow with CPU support only__. If your system does not have a
  NVIDIA® GPU, you must install this version. This version of TensorFlow is
  usually easier to install, so even if you have an NVIDIA GPU, we recommend
  installing this version first.
* __TensorFlow with GPU support__. TensorFlow programs usually run much faster on
  a GPU instead of a CPU. If you run performance-critical applications and your
  system has an NVIDIA® GPU that meets the prerequisites, you should install
  this version. See [TensorFlow GPU support](#NVIDIARequirements) for details.


## Determine how to install TensorFlow
## How to install TensorFlow

You must pick the mechanism by which you install TensorFlow. The
supported choices are as follows:
There are a few options to install TensorFlow on your machine:

* [Virtualenv](#InstallingVirtualenv)
* ["native" pip](#InstallingNativePip)
* [Docker](#InstallingDocker)
* [Anaconda](#InstallingAnaconda)
* installing from sources, which is documented in
  [a separate guide](https://www.tensorflow.org/install/install_sources).
* [Use pip in a virtual environment](#InstallingVirtualenv) *(recommended)*
* [Use pip in your system environment](#InstallingNativePip)
* [Configure a Docker container](#InstallingDocker)
* [Use pip in Anaconda](#InstallingAnaconda)
* [Install TensorFlow from source](/install/install_sources)

**We recommend the Virtualenv installation.**
[Virtualenv](https://virtualenv.pypa.io/en/stable/)
is a virtual Python environment isolated from other Python development,
incapable of interfering with or being affected by other Python programs
on the same machine. During the Virtualenv installation process,
you will install not only TensorFlow but also all the packages that
TensorFlow requires. (This is actually pretty easy.)
To start working with TensorFlow, you simply need to "activate" the
virtual environment. All in all, Virtualenv provides a safe and
reliable mechanism for installing and running TensorFlow.

<a name="InstallingVirtualenv"></a>
### Use `pip` in a virtual environment

Native pip installs TensorFlow directly on your system without going
through any container system. **We recommend the native pip install for
system administrators aiming to make TensorFlow available to everyone on a
multi-user system.** Since a native pip installation is not walled-off in
a separate container, the pip installation might interfere with other
Python-based installations on your system. However, if you understand pip
and your Python environment, a native pip installation often entails only
a single command.
Key Point: Using a virtual environment is the recommended install method.

The [Virtualenv](https://virtualenv.pypa.io/en/stable/) tool creates virtual
Python environments that are isolated from other Python development on the same
machine. In this scenario, you install TensorFlow and its dependencies within a
virtual environment that is available when *activated*. Virtualenv provides a
reliable way to install and run TensorFlow while avoiding conflicts with the rest
of the system.

##### 1. Install Python, `pip`, and `virtualenv`.

On Ubuntu, Python is automatically installed and `pip` is *usually* installed.
Confirm the `python` and `pip` versions:

<pre class="prettyprint lang-bsh">
<code class="devsite-terminal">python -V  # or: python3 -V</code>
<code class="devsite-terminal">pip -V     # or: pip3 -V</code>
</pre>

To install these packages on Ubuntu:

<pre class="prettyprint lang-bsh">
<code class="devsite-terminal">sudo apt-get install python-pip python-dev python-virtualenv   # for Python 2.7</code>
<code class="devsite-terminal">sudo apt-get install python3-pip python3-dev python-virtualenv # for Python 3.n</code>
</pre>

We *recommend* using `pip` version 8.1 or higher. If using a release before
version 8.1, upgrade `pip`:

<pre class="prettyprint lang-bsh">
<code class="devsite-terminal">sudo pip install -U pip</code>
</pre>

If not using Ubuntu and [setuptools](https://pypi.org/project/setuptools/) is
installed, use `easy_install` to install `pip`:

<pre class="prettyprint lang-bsh">
<code class="devsite-terminal">easy_install -U pip</code>
</pre>

##### 2. Create a directory for the virtual environment and choose a Python interpreter.

<pre class="prettyprint lang-bsh">
<code class="devsite-terminal">mkdir ~/tensorflow  # somewhere to work out of</code>
<code class="devsite-terminal">cd ~/tensorflow</code>
<code># Choose one of the following Python environments for the ./venv directory:</code>
<code class="devsite-terminal">virtualenv --system-site-packages <var>venv</var>            # Use python default (Python 2.7)</code>
<code class="devsite-terminal">virtualenv --system-site-packages -p python3 <var>venv</var> # Use Python 3.n</code>
</pre>

##### 3. Activate the Virtualenv environment.

Use one of these shell-specific commands to activate the virtual environment:

<pre class="prettyprint lang-bsh">
<code class="devsite-terminal">source ~/tensorflow/<var>venv</var>/bin/activate      # bash, sh, ksh, or zsh</code>
<code class="devsite-terminal">source ~/tensorflow/<var>venv</var>/bin/activate.csh  # csh or tcsh</code>
<code class="devsite-terminal">. ~/tensorflow/<var>venv</var>/bin/activate.fish      # fish</code>
</pre>

When the Virtualenv is activated, the shell prompt displays as `(venv) $`.

##### 4. Upgrade `pip` in the virtual environment.

Within the active virtual environment, upgrade `pip`:

<pre class="prettyprint lang-bsh">
(venv)$ pip install -U pip
</pre>

You can install other Python packages within the virtual environment without
affecting packages outside the `virtualenv`.

##### 5. Install TensorFlow in the virtual environment.

Choose one of the available TensorFlow packages for installation:

* `tensorflow` —Current release for CPU
* `tensorflow-gpu` —Current release with GPU support
* `tf-nightly` —Nightly build for CPU
* `tf-nightly-gpu` —Nightly build with GPU support

Within an active Virtualenv environment, use `pip` to install the package:

<pre class="prettyprint lang-bsh">
<code class="devsite-terminal">pip install -U tensorflow</code>
</pre>

Use `pip list` to show the packages installed in the virtual environment.
[Validate the install](#ValidateYourInstallation) and test the version:

<pre class="prettyprint lang-bsh">
(venv)$ python -c "import tensorflow as tf; print(tf.__version__)"
</pre>

Success: TensorFlow is now installed.

Use the `deactivate` command to stop the Python virtual environment.

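Beyond printing the version string, you can run a one-line graph as a slightly
stronger smoke test. A minimal sketch (TensorFlow 1.x graph API, the same short
program this guide uses for validation):

```python
# hello_tf.py - minimal TensorFlow smoke test
import tensorflow as tf

hello = tf.constant('Hello, TensorFlow!')
with tf.Session() as sess:
    print(sess.run(hello))  # prints b'Hello, TensorFlow!' on Python 3
```
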
#### Problems

If the above steps failed, try installing the TensorFlow binary using the remote
URL of the `pip` package:

<pre class="prettyprint lang-bsh">
(venv)$ pip install --upgrade <var>remote-pkg-URL</var>   # Python 2.7
(venv)$ pip3 install --upgrade <var>remote-pkg-URL</var>  # Python 3.n
</pre>

The <var>remote-pkg-URL</var> depends on the operating system, Python version,
and GPU support. See [here](#the_url_of_the_tensorflow_python_package) for the
URL naming scheme and location.

See [Common Installation Problems](#common_installation_problems) if you
encounter problems.

#### Uninstall TensorFlow

To uninstall TensorFlow, remove the Virtualenv directory you created in step 2:

<pre class="prettyprint lang-bsh">
<code class="devsite-terminal">deactivate  # stop the virtualenv</code>
<code class="devsite-terminal">rm -r ~/tensorflow/<var>venv</var></code>
</pre>

<a name="InstallingNativePip"></a>
|
||||
### Use `pip` in your system environment
|
||||
|
||||
Use `pip` to install the TensorFlow package directly on your system without
|
||||
using a container or virtual environment for isolation. This method is
|
||||
recommended for system administrators that want a TensorFlow installation that is
|
||||
available to everyone on a multi-user system.
|
||||
|
||||
Since a system install is not isolated, it could interfere with other
|
||||
Python-based installations. But if you understand `pip` and your Python
|
||||
environment, a system `pip` install is straightforward.
|
||||
|
||||
See the
|
||||
[REQUIRED_PACKAGES section of setup.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/pip_package/setup.py)
|
||||
for a list of packages that TensorFlow installs.
|
||||
|
||||
##### 1. Install Python, `pip`, and `virtualenv`.

On Ubuntu, Python is automatically installed and `pip` is *usually* installed.
Confirm the `python` and `pip` versions:

<pre class="prettyprint lang-bsh">
<code class="devsite-terminal">python -V  # or: python3 -V</code>
<code class="devsite-terminal">pip -V     # or: pip3 -V</code>
</pre>

To install these packages on Ubuntu:

<pre class="prettyprint lang-bsh">
<code class="devsite-terminal">sudo apt-get install python-pip python-dev   # for Python 2.7</code>
<code class="devsite-terminal">sudo apt-get install python3-pip python3-dev # for Python 3.n</code>
</pre>

We *recommend* using `pip` version 8.1 or higher. If using a release before
version 8.1, upgrade `pip`:

<pre class="prettyprint lang-bsh">
<code class="devsite-terminal">sudo pip install -U pip</code>
</pre>

If not using Ubuntu and [setuptools](https://pypi.org/project/setuptools/) is
installed, use `easy_install` to install `pip`:

<pre class="prettyprint lang-bsh">
<code class="devsite-terminal">easy_install -U pip</code>
</pre>

##### 2. Install TensorFlow on the system.

Choose one of the available TensorFlow packages for installation:

* `tensorflow` —Current release for CPU
* `tensorflow-gpu` —Current release with GPU support
* `tf-nightly` —Nightly build for CPU
* `tf-nightly-gpu` —Nightly build with GPU support

And use `pip` to install the package for Python 2 or 3:

<pre class="prettyprint lang-bsh">
<code class="devsite-terminal">sudo pip install -U tensorflow   # Python 2.7</code>
<code class="devsite-terminal">sudo pip3 install -U tensorflow  # Python 3.n</code>
</pre>

Use `pip list` to show the packages installed on the system.
[Validate the install](#ValidateYourInstallation) and test the version:

<pre class="prettyprint lang-bsh">
<code class="devsite-terminal">python -c "import tensorflow as tf; print(tf.__version__)"</code>
</pre>

Success: TensorFlow is now installed.

#### Problems

If the above steps failed, try installing the TensorFlow binary using the remote
URL of the `pip` package:

<pre class="prettyprint lang-bsh">
<code class="devsite-terminal">sudo pip install --upgrade <var>remote-pkg-URL</var>   # Python 2.7</code>
<code class="devsite-terminal">sudo pip3 install --upgrade <var>remote-pkg-URL</var>  # Python 3.n</code>
</pre>

The <var>remote-pkg-URL</var> depends on the operating system, Python version,
and GPU support. See [here](#the_url_of_the_tensorflow_python_package) for the
URL naming scheme and location.

See [Common Installation Problems](#common_installation_problems) if you
encounter problems.

#### Uninstall TensorFlow

To uninstall TensorFlow on your system, use one of the following commands:

<pre class="prettyprint lang-bsh">
<code class="devsite-terminal">sudo pip uninstall tensorflow   # for Python 2.7</code>
<code class="devsite-terminal">sudo pip3 uninstall tensorflow  # for Python 3.n</code>
</pre>

<a name="InstallingDocker"></a>
|
||||
### Configure a Docker container
|
||||
|
||||
Docker completely isolates the TensorFlow installation
|
||||
from pre-existing packages on your machine. The Docker container contains
|
||||
@ -142,210 +269,6 @@ large (hundreds of MBs). You might choose the Docker installation if you are
|
||||
incorporating TensorFlow into a larger application architecture that already
|
||||
uses Docker.
|
||||
|
||||
In Anaconda, you may use conda to create a virtual environment.
|
||||
However, within Anaconda, we recommend installing TensorFlow with the
|
||||
`pip install` command, not with the `conda install` command.
|
||||
|
||||
**NOTE:** The conda package is community supported, not officially supported.
|
||||
That is, the TensorFlow team neither tests nor maintains the conda package.
|
||||
Use that package at your own risk.
|
||||
|
||||
|
||||
<a name="InstallingVirtualenv"></a>
|
||||
## Installing with Virtualenv
|
||||
|
||||
Take the following steps to install TensorFlow with Virtualenv:
|
||||
|
||||
1. Install pip and Virtualenv by issuing one of the following commands:
|
||||
|
||||
<pre>$ <b>sudo apt-get install python-pip python-dev python-virtualenv</b> # for Python 2.7
|
||||
$ <b>sudo apt-get install python3-pip python3-dev python-virtualenv</b> # for Python 3.n</pre>
|
||||
|
||||
2. Create a Virtualenv environment by issuing one of the following commands:
|
||||
|
||||
<pre>$ <b>virtualenv --system-site-packages</b> <i>targetDirectory</i> # for Python 2.7
|
||||
$ <b>virtualenv --system-site-packages -p python3</b> <i>targetDirectory</i> # for Python 3.n</pre>
|
||||
|
||||
where <code><em>targetDirectory</em></code> specifies the top of the
|
||||
Virtualenv tree. Our instructions assume that
|
||||
<code><em>targetDirectory</em></code> is `~/tensorflow`, but you may
|
||||
choose any directory.
|
||||
|
||||
3. Activate the Virtualenv environment by issuing one of the following
|
||||
commands:
|
||||
|
||||
<pre>$ <b>source ~/tensorflow/bin/activate</b> # bash, sh, ksh, or zsh
|
||||
$ <b>source ~/tensorflow/bin/activate.csh</b> # csh or tcsh
|
||||
$ <b>. ~/tensorflow/bin/activate.fish</b> # fish</pre>
|
||||
|
||||
The preceding <tt>source</tt> command should change your prompt
|
||||
to the following:
|
||||
|
||||
<pre>(tensorflow)$ </pre>
|
||||
|
||||
4. Ensure pip ≥8.1 is installed:
|
||||
|
||||
<pre>(tensorflow)$ <b>easy_install -U pip</b></pre>
|
||||
|
||||
5. Issue one of the following commands to install TensorFlow in the active
|
||||
Virtualenv environment:
|
||||
|
||||
<pre>(tensorflow)$ <b>pip install --upgrade tensorflow</b> # for Python 2.7
|
||||
(tensorflow)$ <b>pip3 install --upgrade tensorflow</b> # for Python 3.n
|
||||
(tensorflow)$ <b>pip install --upgrade tensorflow-gpu</b> # for Python 2.7 and GPU
|
||||
(tensorflow)$ <b>pip3 install --upgrade tensorflow-gpu</b> # for Python 3.n and GPU</pre>
|
||||
|
||||
If the above command succeeds, skip Step 6. If the preceding
|
||||
command fails, perform Step 6.
|
||||
|
||||
6. (Optional) If Step 5 failed (typically because you invoked a pip version
|
||||
lower than 8.1), install TensorFlow in the active Virtualenv environment
|
||||
by issuing a command of the following format:
|
||||
|
||||
<pre>(tensorflow)$ <b>pip install --upgrade</b> <i>tfBinaryURL</i> # Python 2.7
|
||||
(tensorflow)$ <b>pip3 install --upgrade</b> <i>tfBinaryURL</i> # Python 3.n </pre>
|
||||
|
||||
where <code><em>tfBinaryURL</em></code> identifies the URL of the
|
||||
TensorFlow Python package. The appropriate value of
|
||||
<code><em>tfBinaryURL</em></code>depends on the operating system,
|
||||
Python version, and GPU support. Find the appropriate value for
|
||||
<code><em>tfBinaryURL</em></code> for your system
|
||||
[here](#the_url_of_the_tensorflow_python_package). For example, if you
|
||||
are installing TensorFlow for Linux, Python 3.4, and CPU-only support,
|
||||
issue the following command to install TensorFlow in the active
|
||||
Virtualenv environment:
|
||||
|
||||
<pre>(tensorflow)$ <b>pip3 install --upgrade \
|
||||
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
|
||||
|
||||
If you encounter installation problems, see
|
||||
[Common Installation Problems](#common_installation_problems).
|
||||
|
||||
|
||||
### Next Steps

After installing TensorFlow,
[validate the installation](#ValidateYourInstallation).

Note that you must activate the Virtualenv environment each time you
use TensorFlow. If the Virtualenv environment is not currently active,
invoke one of the following commands:

<pre>$ <b>source ~/tensorflow/bin/activate</b>     # bash, sh, ksh, or zsh
$ <b>source ~/tensorflow/bin/activate.csh</b> # csh or tcsh</pre>

When the Virtualenv environment is active, you may run
TensorFlow programs from this shell. Your prompt will become
the following to indicate that your tensorflow environment is active:

<pre>(tensorflow)$ </pre>

When you are done using TensorFlow, you may deactivate the
environment by invoking the `deactivate` function as follows:

<pre>(tensorflow)$ <b>deactivate</b> </pre>

The prompt will revert to your default prompt (as defined by the
`PS1` environment variable).


### Uninstalling TensorFlow

To uninstall TensorFlow, simply remove the tree you created.
For example:

<pre>$ <b>rm -r</b> <i>targetDirectory</i> </pre>


<a name="InstallingNativePip"></a>
## Installing with native pip

You may install TensorFlow through pip, choosing between a simple
installation procedure or a more complex one.

**Note:** The
[REQUIRED_PACKAGES section of setup.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/pip_package/setup.py)
lists the TensorFlow packages that pip will install or upgrade.

### Prerequisite: Python and Pip

Python is automatically installed on Ubuntu. Take a moment to confirm
(by issuing a `python -V` command) that one of the following Python
versions is already installed on your system:

* Python 2.7
* Python 3.4+

The pip or pip3 package manager is *usually* installed on Ubuntu. Take a
moment to confirm (by issuing a `pip -V` or `pip3 -V` command)
that pip or pip3 is installed. We strongly recommend version 8.1 or higher
of pip or pip3. If Version 8.1 or later is not installed, issue the
following command, which will either install or upgrade to the latest
pip version:

<pre>$ <b>sudo apt-get install python-pip python-dev</b>   # for Python 2.7
$ <b>sudo apt-get install python3-pip python3-dev</b> # for Python 3.n
</pre>

### Install TensorFlow

Assuming the prerequisite software is installed on your Linux host,
take the following steps:

1. Install TensorFlow by invoking **one** of the following commands:

   <pre>$ <b>pip install tensorflow</b>      # Python 2.7; CPU support (no GPU support)
   $ <b>pip3 install tensorflow</b>     # Python 3.n; CPU support (no GPU support)
   $ <b>pip install tensorflow-gpu</b>  # Python 2.7; GPU support
   $ <b>pip3 install tensorflow-gpu</b> # Python 3.n; GPU support</pre>

   If the preceding command runs to completion, you should now
   [validate your installation](#ValidateYourInstallation).

2. (Optional.) If Step 1 failed, install the latest version of TensorFlow
   by issuing a command of the following format:

   <pre>$ <b>sudo pip install --upgrade</b> <i>tfBinaryURL</i>  # Python 2.7
   $ <b>sudo pip3 install --upgrade</b> <i>tfBinaryURL</i> # Python 3.n</pre>

   where <code><em>tfBinaryURL</em></code> identifies the URL of the
   TensorFlow Python package. The appropriate value of
   <code><em>tfBinaryURL</em></code> depends on the operating system,
   Python version, and GPU support. Find the appropriate value for
   <code><em>tfBinaryURL</em></code>
   [here](#the_url_of_the_tensorflow_python_package). For example, to
   install TensorFlow for Linux, Python 3.4, and CPU-only support, issue
   the following command:

   <pre>
   $ <b>sudo pip3 install --upgrade \
   https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl</b>
   </pre>

   If this step fails, see
   [Common Installation Problems](#common_installation_problems).

### Next Steps

After installing TensorFlow, [validate your installation](#ValidateYourInstallation).


### Uninstalling TensorFlow

To uninstall TensorFlow, issue one of the following commands:

<pre>
$ <b>sudo pip uninstall tensorflow</b>  # for Python 2.7
$ <b>sudo pip3 uninstall tensorflow</b> # for Python 3.n
</pre>


<a name="InstallingDocker"></a>
## Installing with Docker

Take the following steps to install TensorFlow through Docker:

1. Install Docker on your machine as described in the
@ -364,7 +287,7 @@ Take the following steps to install TensorFlow through Docker:
The remainder of this section explains how to launch a Docker container.


### CPU-only
#### CPU-only

To launch a Docker container with CPU-only support (that is, without
GPU support), enter a command of the following format:
@ -414,7 +337,7 @@ $ <b>docker run -it -p 8888:8888 tensorflow/tensorflow</b>
Docker will download the TensorFlow binary image the first time you launch it.


### GPU support
#### GPU support

Prior to installing TensorFlow with GPU support, ensure that your system meets all
[NVIDIA software requirements](#NVIDIARequirements). To launch a Docker container
@ -470,14 +393,22 @@ For more details see the
[TensorFlow docker readme](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/docker).


### Next Steps
#### Next Steps

You should now
[validate your installation](#ValidateYourInstallation).

<a name="InstallingAnaconda"></a>
|
||||
## Installing with Anaconda
|
||||
### Use `pip` in Anaconda
|
||||
|
||||
Anaconda provides the `conda` utility to create a virtual environment. However,
|
||||
within Anaconda, we recommend installing TensorFlow using the `pip install`
|
||||
command and *not* with the `conda install` command.
|
||||
|
||||
Caution: `conda` is a community supported package this is not officially
|
||||
maintained by the TensorFlow team. Use this package at your own risk since it is
|
||||
not tested on new TensorFlow releases.
|
||||
|
||||
Take the following steps to install TensorFlow in an Anaconda environment:
|
||||
|
||||
@ -507,7 +438,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
|
||||
|
||||
<pre>
|
||||
(tensorflow)$ <b>pip install --ignore-installed --upgrade \
|
||||
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
|
||||
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
|
||||
|
||||
<a name="ValidateYourInstallation"></a>
|
||||
## Validate your installation
|
||||
@ -563,11 +494,89 @@ installation problems](#common_installation_problems).
|
||||
If you are new to machine learning, we recommend the following:
|
||||
|
||||
* [Machine Learning Crash Course](https://developers.google.com/machine-learning/crash-course)
|
||||
* @{$get_started/get_started_for_beginners$Getting Started for ML Beginners}
|
||||
* @{$get_started/eager}
|
||||
|
||||
If you are experienced with machine learning but new to TensorFlow, see
|
||||
@{$get_started/eager}.
|
||||
|
||||
<a name="NVIDIARequirements"></a>
|
||||
## TensorFlow GPU support
|
||||
|
||||
To install TensorFlow with GPU support, configure the following NVIDIA® software
|
||||
on your system:
|
||||
|
||||
* [CUDA Toolkit 9.0](http://nvidia.com/cuda). For details, see
|
||||
[NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/).
|
||||
Append the relevant CUDA pathnames to the `LD_LIBRARY_PATH` environmental
|
||||
variable as described in the NVIDIA documentation.
|
||||
* [cuDNN SDK v7](http://developer.nvidia.com/cudnn). For details, see
|
||||
[NVIDIA's documentation](http://docs.nvidia.com/deeplearning/sdk/cudnn-install/).
|
||||
Create the `CUDA_HOME` environment variable as described in the NVIDIA
|
||||
documentation.
|
||||
* A GPU card with CUDA Compute Capability 3.0 or higher for building TensorFlow
|
||||
from source. To use the TensorFlow binaries, version 3.5 or higher is required.
|
||||
See the [NVIDIA documentation](https://developer.nvidia.com/cuda-gpus) for a
|
||||
list of supported GPU cards.
|
||||
* [GPU drivers](http://nvidia.com/driver) that support your version of the CUDA
|
||||
Toolkit.
|
||||
* The `libcupti-dev` library is the NVIDIA CUDA Profile Tools Interface. This
|
||||
library provides advanced profiling support. To install this library,
|
||||
use the following command for CUDA Toolkit >= 8.0:
|
||||
|
||||
<pre class="prettyprint lang-bsh">
|
||||
<code class="devsite-terminal">sudo apt-get install cuda-command-line-tools</code>
|
||||
</pre>
|
||||
|
||||
Add this path to the `LD_LIBRARY_PATH` environmental variable:
|
||||
|
||||
<pre class="prettyprint lang-bsh">
|
||||
<code class="devsite-terminal">export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/usr/local/cuda/extras/CUPTI/lib64</code>
|
||||
</pre>
|
||||
|
||||
For CUDA Toolkit <= 7.5 use:
|
||||
|
||||
<pre class="prettyprint lang-bsh">
|
||||
<code class="devsite-terminal">sudo apt-get install libcupti-dev</code>
|
||||
</pre>
|
||||
|
||||
* *OPTIONAL*: For optimized performance during inference, install
  *NVIDIA TensorRT 3.0*. To install the minimal amount of TensorRT
  runtime components required for use with the pre-built `tensorflow-gpu` package:

  <pre class="prettyprint lang-bsh">
  <code class="devsite-terminal">wget https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1404/x86_64/nvinfer-runtime-trt-repo-ubuntu1404-3.0.4-ga-cuda9.0_1.0-1_amd64.deb</code>
  <code class="devsite-terminal">sudo dpkg -i nvinfer-runtime-trt-repo-ubuntu1404-3.0.4-ga-cuda9.0_1.0-1_amd64.deb</code>
  <code class="devsite-terminal">sudo apt-get update</code>
  <code class="devsite-terminal">sudo apt-get install -y --allow-downgrades libnvinfer-dev libcudnn7-dev=7.0.5.15-1+cuda9.0 libcudnn7=7.0.5.15-1+cuda9.0</code>
  </pre>

  Note: For compatibility with the pre-built `tensorflow-gpu` package, use the
  Ubuntu *14.04* package of TensorRT (shown above). Use this even when installing
  on an Ubuntu 16.04 system.

  To build the TensorFlow-TensorRT integration module from source instead of using
  the pre-built binaries, see the
  [module documentation](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/tensorrt#using-tensorrt-in-tensorflow).
  For detailed TensorRT installation instructions, see
  [NVIDIA's TensorRT documentation](http://docs.nvidia.com/deeplearning/sdk/tensorrt-install-guide/index.html).

  To avoid cuDNN version conflicts during later system upgrades, hold the cuDNN
  version at 7.0.5:

  <pre class="prettyprint lang-bsh">
  <code class="devsite-terminal">sudo apt-mark hold libcudnn7 libcudnn7-dev</code>
  </pre>

  To allow upgrades, remove this hold:

  <pre class="prettyprint lang-bsh">
  <code class="devsite-terminal">sudo apt-mark unhold libcudnn7 libcudnn7-dev</code>
  </pre>

If you have an earlier version of the preceding packages, upgrade to the
specified versions. If upgrading is not possible, you can still run TensorFlow
with GPU support by @{$install_sources}.

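Once the NVIDIA software above is in place, a quick way to confirm that a
`tensorflow-gpu` install can actually see the GPU is to list the local devices.
A minimal sketch (TensorFlow 1.x; `device_lib` is the helper commonly used for
this check):

```python
# list_devices.py - confirm TensorFlow sees the GPU
from tensorflow.python.client import device_lib

# With a working CUDA/cuDNN setup this list includes a '/device:GPU:0' entry.
print([d.name for d in device_lib.list_local_devices()])
```
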
## Common installation problems

@ -581,7 +590,7 @@ ask a new question about it on Stack Overflow and specify
the `tensorflow` tag.

<table>
<tr> <th>Stack Overflow Link</th> <th>Error Message</th> </tr>
<tr> <th>Link to GitHub or Stack Overflow</th> <th>Error Message</th> </tr>

<tr>
<td><a href="https://stackoverflow.com/q/36159194">36159194</a></td>
@ -681,14 +690,14 @@ This section documents the relevant values for Linux installations.
CPU only:

<pre>
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp27-none-linux_x86_64.whl
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp27-none-linux_x86_64.whl
</pre>


GPU support:

<pre>
https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp27-none-linux_x86_64.whl
https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc1-cp27-none-linux_x86_64.whl
</pre>

Note that GPU support requires the NVIDIA hardware and software described in
@ -700,14 +709,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
CPU only:

<pre>
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp34-cp34m-linux_x86_64.whl
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp34-cp34m-linux_x86_64.whl
</pre>


GPU support:

<pre>
https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp34-cp34m-linux_x86_64.whl
https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc1-cp34-cp34m-linux_x86_64.whl
</pre>

Note that GPU support requires the NVIDIA hardware and software described in
@ -719,14 +728,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
CPU only:

<pre>
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp35-cp35m-linux_x86_64.whl
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp35-cp35m-linux_x86_64.whl
</pre>


GPU support:

<pre>
https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp35-cp35m-linux_x86_64.whl
https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc1-cp35-cp35m-linux_x86_64.whl
</pre>


@ -738,14 +747,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
CPU only:

<pre>
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc0-cp36-cp36m-linux_x86_64.whl
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp36-cp36m-linux_x86_64.whl
</pre>


GPU support:

<pre>
https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc0-cp36-cp36m-linux_x86_64.whl
https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc1-cp36-cp36m-linux_x86_64.whl
</pre>

@ -119,7 +119,7 @@ Take the following steps to install TensorFlow with Virtualenv:
TensorFlow in the active Virtualenv is as follows:

<pre> $ <b>pip3 install --upgrade \
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py3-none-any.whl</b></pre>
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py3-none-any.whl</b></pre>

If you encounter installation problems, see
[Common Installation Problems](#common-installation-problems).
@ -242,7 +242,7 @@ take the following steps:
issue the following command:

<pre> $ <b>sudo pip3 install --upgrade \
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py3-none-any.whl</b> </pre>
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py3-none-any.whl</b> </pre>

If the preceding command fails, see
[installation problems](#common-installation-problems).
@ -350,7 +350,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
TensorFlow for Python 2.7:

<pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py2-none-any.whl</b></pre>
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py2-none-any.whl</b></pre>


<a name="ValidateYourInstallation"></a>
@ -524,7 +524,7 @@ The value you specify depends on your Python version.


<pre>
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py2-none-any.whl
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py2-none-any.whl
</pre>


@ -532,5 +532,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py2-none-a


<pre>
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc0-py3-none-any.whl
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py3-none-any.whl
</pre>

@ -354,10 +354,10 @@ Invoke `pip install` to install that pip package.
The filename of the `.whl` file depends on your platform.
For example, the following command will install the pip package

for TensorFlow 1.8.0rc0 on Linux:
for TensorFlow 1.8.0rc1 on Linux:

<pre>
$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.8.0rc0-py2-none-any.whl</b>
$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.8.0rc1-py2-none-any.whl</b>
</pre>

## Validate your installation

@ -86,7 +86,7 @@ code. `tf_library` utilizes `tfcompile` to compile the TensorFlow graph into
executable code.

```build
load("//third_party/tensorflow/compiler/aot:tfcompile.bzl", "tf_library")
load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library")

# Use the tf_library macro to compile your graph into executable code.
tf_library(
@ -258,8 +258,8 @@ file.

```build
# Example of linking your binary
# Also see //third_party/tensorflow/compiler/aot/tests/BUILD
load("//third_party/tensorflow/compiler/aot:tfcompile.bzl", "tf_library")
# Also see //tensorflow/compiler/aot/tests/BUILD
load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library")

# The same tf_library call from step 2 above.
tf_library(

0
tensorflow/examples/tutorials/input_fn/__init__.py
Normal file
0
tensorflow/examples/tutorials/input_fn/__init__.py
Normal file
0
tensorflow/examples/tutorials/layers/__init__.py
Normal file
0
tensorflow/examples/tutorials/layers/__init__.py
Normal file
0
tensorflow/examples/tutorials/monitors/__init__.py
Normal file
0
tensorflow/examples/tutorials/monitors/__init__.py
Normal file
@ -32,9 +32,9 @@ IRIS_TEST = os.path.join(os.path.dirname(__file__), "iris_test.csv")
def main(unused_argv):
  # Load datasets.
  training_set = tf.contrib.learn.datasets.base.load_csv_with_header(
      filename=IRIS_TRAINING, target_dtype=np.int, features_dtype=np.float)
      filename=IRIS_TRAINING, target_dtype=np.int, features_dtype=np.float32)
  test_set = tf.contrib.learn.datasets.base.load_csv_with_header(
      filename=IRIS_TEST, target_dtype=np.int, features_dtype=np.float)
      filename=IRIS_TEST, target_dtype=np.int, features_dtype=np.float32)

  validation_metrics = {
      "accuracy":
@ -83,7 +83,7 @@ def main(unused_argv):

  # Classify two new flower samples.
  new_samples = np.array(
      [[6.4, 3.2, 4.5, 1.5], [5.8, 3.1, 5.0, 1.7]], dtype=float)
      [[6.4, 3.2, 4.5, 1.5], [5.8, 3.1, 5.0, 1.7]], dtype=np.float32)
  y = list(classifier.predict(new_samples))
  print("Predictions: {}".format(str(y)))

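The dtype changes above matter because `np.float` is an alias for 64-bit
`float64`, while the estimator's feature columns expect 32-bit floats. A small
illustrative sketch (the variable names here are hypothetical, not from the
tutorial):

```python
import numpy as np
import tensorflow as tf

samples64 = np.array([[6.4, 3.2, 4.5, 1.5]], dtype=float)  # np.float == float64
samples32 = samples64.astype(np.float32)                   # what the model expects

print(tf.constant(samples64).dtype)  # <dtype: 'float64'>
print(tf.constant(samples32).dtype)  # <dtype: 'float32'>
```
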
@ -5,7 +5,7 @@ Construct and execute TensorFlow graphs in Go.
[](https://godoc.org/github.com/tensorflow/tensorflow/tensorflow/go)

> *WARNING*: The API defined in this package is not stable and can change
> without notice. The same goes for the awkward package path
> without notice. The same goes for the package path:
> (`github.com/tensorflow/tensorflow/tensorflow/go`).

## Quickstart

@ -21386,7 +21386,7 @@ func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
// generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
//
// The `bad_color` argument is the color to use in the generated images for
// non-finite input values. It is a `unit8` 1-D tensor of length `channels`.
// non-finite input values. It is a `uint8` 1-D tensor of length `channels`.
// Each element must be in the range `[0, 255]` (It represents the value of a
// pixel in the output image). Non-finite values in the input tensor are
// replaced by this tensor in the output image. The default value is the color

@ -644,11 +644,9 @@ class Estimator(object):
          sharded=True)
      saver_for_restore.restore(session, checkpoint_path)

      # pylint: disable=protected-access
      local_init_op = (
          estimator_spec.scaffold.local_init_op or
          monitored_session.Scaffold._default_local_init_op())
      # pylint: enable=protected-access
          monitored_session.Scaffold.default_local_init_op())

      # Perform the export
      builder = saved_model_builder.SavedModelBuilder(temp_export_dir)

@ -29,12 +29,14 @@ from tensorflow.python.estimator import run_config as run_config_lib
from tensorflow.python.framework import ops
from tensorflow.python.framework import random_seed
from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
from tensorflow.python.framework import tensor_util
from tensorflow.python.keras._impl.keras import backend as K
from tensorflow.python.keras._impl.keras import models
from tensorflow.python.keras._impl.keras import optimizers
from tensorflow.python.keras._impl.keras.engine.base_layer import Layer
from tensorflow.python.keras._impl.keras.engine.network import Network
from tensorflow.python.keras._impl.keras.utils.generic_utils import CustomObjectScope
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import metrics as metrics_module
from tensorflow.python.ops import variables as variables_module
@ -55,6 +57,17 @@ def _cast_tensor_to_floatx(x):
  return math_ops.cast(x, K.floatx())


def _convert_tensor(x):
  """Create or cast tensor if needed."""
  if not tensor_util.is_tensor(x):
    # x is a numpy array
    x = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(x)
  if check_ops.is_numeric_tensor(x):
    # is_numeric_tensor returns False if provided with a numpy array
    x = _cast_tensor_to_floatx(x)
  return x


def _any_variable_initalized():
  """Check if any variable has been initialized in the Keras model.

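For readers outside the TensorFlow codebase, the same conversion can be
sketched with public TF 1.x APIs. This is a simplified stand-in for the
internal `_convert_tensor` above, not the actual implementation; the
`convert_tensor` name and `floatx` parameter are illustrative:

```python
import numpy as np
import tensorflow as tf

def convert_tensor(x, floatx=tf.float32):
  """Wrap numpy input as a tensor; cast numeric tensors to the float dtype."""
  if not isinstance(x, (tf.Tensor, tf.SparseTensor)):
    x = tf.convert_to_tensor(x)          # e.g. a numpy array from an input_fn
  if isinstance(x, tf.Tensor) and (x.dtype.is_floating or x.dtype.is_integer):
    x = tf.cast(x, floatx)               # mirrors _cast_tensor_to_floatx
  return x

t = convert_tensor(np.array([1, 2, 3]))
print(t.dtype)  # <dtype: 'float32'>
```
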
@ -86,7 +99,7 @@ def _create_ordered_io(keras_model, estimator_io, is_input=True):
  if isinstance(estimator_io, (list, tuple)):
    # Case currently not supported by most built-in input_fn,
    # but it's good to have for sanity
    return [_cast_tensor_to_floatx(x) for x in estimator_io]
    return [_convert_tensor(x) for x in estimator_io]
  elif isinstance(estimator_io, dict):
    if is_input:
      if keras_model._is_graph_network:
@ -108,12 +121,12 @@ def _create_ordered_io(keras_model, estimator_io, is_input=True):
                       'It needs to match one '
                       'of the following: %s' % ('input' if is_input else 'output', key,
                                                 ', '.join(keras_io_names)))
    tensors = [_cast_tensor_to_floatx(estimator_io[io_name])
    tensors = [_convert_tensor(estimator_io[io_name])
               for io_name in keras_io_names]
    return tensors
  else:
    # Plain array.
    return _cast_tensor_to_floatx(estimator_io)
    return _convert_tensor(estimator_io)


def _in_place_subclassed_model_reset(model):
@ -274,8 +287,7 @@ def _clone_and_build_model(mode,
          is_input=False)
    else:
      target_tensors = [
          _cast_tensor_to_floatx(
              sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(labels))
          _convert_tensor(labels)
      ]

    if keras_model._is_graph_network:

@ -30,6 +30,7 @@ from tensorflow.python.estimator.inputs import numpy_io
from tensorflow.python.framework import ops
from tensorflow.python.framework import test_util
from tensorflow.python.keras._impl import keras
from tensorflow.python.keras._impl.keras import backend as K
from tensorflow.python.keras._impl.keras import testing_utils
from tensorflow.python.keras._impl.keras.applications import mobilenet
from tensorflow.python.keras._impl.keras.optimizers import SGD
@ -142,16 +143,20 @@ def randomize_io_type(array, name):


def multi_inputs_multi_outputs_model():
  # test multi-input layer
  a = keras.layers.Input(shape=(16,), name='input_a')
  b = keras.layers.Input(shape=(16,), name='input_b')
  m = keras.layers.Input(shape=(8,), dtype='bool', name='input_m')
  dense = keras.layers.Dense(8, name='dense_1')

  a_2 = dense(a)
  # Apply a mask
  s_2 = keras.layers.Lambda(lambda k:
                            K.switch(k[0], k[1], K.zeros_like(k[1])))([m, a_2])
  b_2 = dense(b)
  merged = keras.layers.concatenate([a_2, b_2], name='merge')
  merged = keras.layers.concatenate([s_2, b_2], name='merge')
  c = keras.layers.Dense(3, activation='softmax', name='dense_2')(merged)
  d = keras.layers.Dense(2, activation='softmax', name='dense_3')(merged)
  model = keras.models.Model(inputs=[a, b], outputs=[c, d])
  model = keras.models.Model(inputs=[a, b, m], outputs=[c, d])
  model.compile(
      loss='categorical_crossentropy',
      optimizer='rmsprop',
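The masking Lambda above keeps the elements of `a_2` where the boolean input
`m` is true and zeroes the rest. In isolation the same effect can be seen with
`tf.where`, which is what `K.switch` lowers to for elementwise conditions on
the TF backend. A minimal sketch:

```python
import tensorflow as tf

m = tf.constant([[True, False, True]])
a = tf.constant([[1.0, 2.0, 3.0]])
masked = tf.where(m, a, tf.zeros_like(a))  # keep where m is True, else 0

with tf.Session() as sess:
    print(sess.run(masked))  # [[1. 0. 3.]]
```
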
@ -352,18 +357,27 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
        test_samples=50,
        input_shape=(16,),
        num_classes=2)
    np.random.seed(_RANDOM_SEED)
    (input_m_train, _), (input_m_test, _) = testing_utils.get_test_data(
        train_samples=_TRAIN_SIZE,
        test_samples=50,
        input_shape=(8,),
        num_classes=2)

    c_train = keras.utils.to_categorical(c_train)
    c_test = keras.utils.to_categorical(c_test)
    d_train = keras.utils.to_categorical(d_train)
    d_test = keras.utils.to_categorical(d_test)

    def train_input_fn():
      input_dict = {'input_a': a_train, 'input_b': b_train}
      input_dict = {'input_a': a_train, 'input_b': b_train,
                    'input_m': input_m_train > 0}
      output_dict = {'dense_2': c_train, 'dense_3': d_train}
      return input_dict, output_dict

    def eval_input_fn():
      input_dict = {'input_a': a_test, 'input_b': b_test}
      input_dict = {'input_a': a_test, 'input_b': b_test,
                    'input_m': input_m_test > 0}
      output_dict = {'dense_2': c_test, 'dense_3': d_test}
      return input_dict, output_dict

@ -35,8 +35,7 @@ class DivisionTestCase(test.TestCase):
    """Test all the different ways to divide."""
    values = [1, 2, 7, 11]
    functions = (lambda x: x), constant_op.constant
    # TODO(irving): Test int8, int16 once we support casts for those.
    dtypes = np.int32, np.int64, np.float32, np.float64
    dtypes = np.int8, np.int16, np.int32, np.int64, np.float32, np.float64

    tensors = []
    checks = []
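The change above removes the TODO and adds `int8`/`int16` to the tested
dtypes, since casts for those types are now supported. In practice this means
Python-3-style true division works on small integer tensors too. A minimal
sketch of the behavior being tested:

```python
import tensorflow as tf

x = tf.constant(7, dtype=tf.int8)
y = tf.constant(2, dtype=tf.int8)

with tf.Session() as sess:
    # True division upcasts the int8 operands to a float dtype.
    print(sess.run(x / y))  # 3.5
```
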
@ -160,7 +160,7 @@ class ReduceJoinTest(UnicodeTestCase):
          separator=separator)
      if not reduction_indices:
        truth = constant_op.constant(truth)
      truth_squeezed = array_ops.squeeze(truth, squeeze_dims=reduction_indices)
      truth_squeezed = array_ops.squeeze(truth, axis=reduction_indices)
      output_array = output.eval()
      output_keep_dims_array = output_keep_dims.eval()
      truth_array = truth.eval()
@ -889,9 +889,9 @@ class AnyReductionTest(test.TestCase):

class CountNonzeroReductionTest(test.TestCase):

  def _compare(self, x, reduction_axes, keepdims, use_gpu=False,
  def _compare(self, x, reduction_axes, keepdims, use_gpu=False, zero=0,
               feed_dict=None):
    np_ans = (x != 0).astype(np.int32)
    np_ans = (x != zero).astype(np.int32)
    if reduction_axes is None:
      np_ans = np.sum(np_ans, keepdims=keepdims)
    else:
@ -958,6 +958,37 @@ class CountNonzeroReductionTest(test.TestCase):
      y = math_ops.count_nonzero(x, [0])
      self.assertAllEqual(y.eval(), np.zeros(9938))

  def testStringReduce(self):
    # Test case for GitHub issue 18712
    with self.test_session() as sess:
      v = math_ops.count_nonzero(constant_op.constant(["test"]))
      self.assertAllClose(sess.run(v), 1)

  def testStringReduce1D(self):
    # Create a 1D array of strings
    x = np.asarray(["", "", "a", "", "", "b"])
    self._compare(x, None, keepdims=False, zero=np.str(""))
    self._compare(x, [], keepdims=False, zero=np.str(""))
    self._compare(x, [0], keepdims=False, zero=np.str(""))
    self._compare(x, None, keepdims=True, zero=np.str(""))
    self._compare(x, [], keepdims=True, zero=np.str(""))
    self._compare(x, [0], keepdims=True, zero=np.str(""))

  def testStringReduce2D(self):
    # Create a 2D array of strings
    x = np.asarray([["", "", "a", "", "", "b"],
                    ["", "c", "", "d", "", ""],
                    ["e", "", "f", "", "", ""]])
    self._compare(x, None, keepdims=False, zero=np.str(""))
    self._compare(x, [], keepdims=False, zero=np.str(""))
    self._compare(x, [0], keepdims=False, zero=np.str(""))
    self._compare(x, [1], keepdims=False, zero=np.str(""))
    self._compare(x, [0, 1], keepdims=False, zero=np.str(""))
    self._compare(x, None, keepdims=True, zero=np.str(""))
    self._compare(x, [], keepdims=True, zero=np.str(""))
    self._compare(x, [0], keepdims=True, zero=np.str(""))
    self._compare(x, [0, 1], keepdims=True, zero=np.str(""))


if __name__ == "__main__":
  test.main()
|
||||
del input_ # input_ is not used in scatter_nd
|
||||
return array_ops.scatter_nd(indices, updates, shape)
|
||||
|
||||
def testString(self):
|
||||
indices = constant_op.constant([[4], [3], [1], [7]],
|
||||
dtype=dtypes.int32)
|
||||
updates = constant_op.constant(["four", "three", "one", "seven"],
|
||||
dtype=dtypes.string)
|
||||
expected = np.array([b"", b"one", b"", b"three", b"four",
|
||||
b"", b"", b"seven"])
|
||||
scatter = self.scatter_nd(indices, updates, shape=(8,))
|
||||
with self.test_session() as sess:
|
||||
result = sess.run(scatter)
|
||||
self.assertAllEqual(expected, result)
|
||||
|
||||
# Same indice is updated twice by same value.
|
||||
indices = constant_op.constant([[4], [3], [3], [7]],
|
||||
dtype=dtypes.int32)
|
||||
updates = constant_op.constant(["a", "b", "b", "c"],
|
||||
dtype=dtypes.string)
|
||||
expected = np.array([b"", b"", b"", b"bb", b"a", b"", b"", b"c"])
|
||||
scatter = self.scatter_nd(indices, updates, shape=(8,))
|
||||
with self.test_session() as sess:
|
||||
result = sess.run(scatter)
|
||||
self.assertAllEqual(expected, result)
|
||||
|
||||
# Same indice is updated twice by different value.
|
||||
indices = constant_op.constant([[4], [3], [3], [7]],
|
||||
dtype=dtypes.int32)
|
||||
updates = constant_op.constant(["a", "b", "c", "d"],
|
||||
dtype=dtypes.string)
|
||||
expected = [np.array([b"", b"", b"", b"bc", b"a", b"", b"", b"d"]),
|
||||
np.array([b"", b"", b"", b"cb", b"a", b"", b"", b"d"])]
|
||||
scatter = self.scatter_nd(indices, updates, shape=(8,))
|
||||
with self.test_session() as sess:
|
||||
result = sess.run(scatter)
|
||||
self.assertTrue(np.array_equal(result, expected[0]) or
|
||||
np.array_equal(result, expected[1]))
|
||||
|
||||
def testRank3ValidShape(self):
|
||||
indices = array_ops.zeros([2, 2, 2], dtypes.int32)
|
||||
updates = array_ops.zeros([2, 2, 2], dtypes.int32)
|
||||
@ -584,6 +620,10 @@ class ScatterNdNonAliasingAddTest(ScatterNdTest):
|
||||
shape, dtype=updates.dtype))
|
||||
return array_ops.scatter_nd_non_aliasing_add(input_, indices, updates)
|
||||
|
||||
def testString(self):
|
||||
# Not supported yet.
|
||||
pass
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test.main()
|
||||
|
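The tests above pin down the string semantics of `tf.scatter_nd`: updates are
placed at the given indices in a tensor of empty strings, and updates that hit
the same index are concatenated (in an unspecified order). A minimal sketch:

```python
import tensorflow as tf

indices = tf.constant([[1], [3], [3]])
updates = tf.constant(["a", "b", "c"])
scattered = tf.scatter_nd(indices, updates, shape=[4])

with tf.Session() as sess:
    # [b'' b'a' b'' b'bc'] -- or b'cb'; duplicate-update order is not guaranteed
    print(sess.run(scattered))
```
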
@ -196,7 +196,7 @@ def _ConcatGradHelper(op, grad, start_value_index, end_value_index, dim_index):
              array_ops.where(
                  math_ops.logical_and(grad.indices >= start,
                                       grad.indices < end)),
              squeeze_dims=[1])
              axis=[1])
          new_indices = array_ops.gather(grad.indices, indices_to_select) - start
          new_values = array_ops.gather(grad.values, indices_to_select)
          out_grads.append(ops.IndexedSlices(new_values, new_indices, size))
@ -994,9 +994,7 @@ def unstack(value, num=None, axis=0, name="unstack"):
  `value[:, i, :, :]` and each tensor in `output` will have shape `(A, C, D)`.
  Etc.

  This is the opposite of stack. The numpy equivalent is

      tf.unstack(x, n) = np.unstack(x)
  This is the opposite of stack.

  Args:
    value: A rank `R > 0` `Tensor` to be unstacked.
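The docstring fix above drops the bogus numpy comparison (numpy has no
`np.unstack`); `tf.unstack` simply inverts `tf.stack` along an axis. A short
sketch:

```python
import tensorflow as tf

x = tf.zeros([2, 3, 4])
pieces = tf.unstack(x, axis=1)        # list of 3 tensors, each of shape (2, 4)
restacked = tf.stack(pieces, axis=1)  # shape (2, 3, 4) again

print(len(pieces), pieces[0].shape, restacked.shape)
```
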
@ -1720,8 +1718,10 @@ def placeholder(dtype, shape=None, name=None):
    print(sess.run(y, feed_dict={x: rand_array}))  # Will succeed.
  ```

  @compatibility{eager} Placeholders are not compatible with eager execution.

  @compatibility(eager)
  Placeholders are not compatible with eager execution.
  @end_compatibility

  Args:
    dtype: The type of elements in the tensor to be fed.
    shape: The shape of the tensor to be fed (optional). If the shape is not
@ -652,7 +652,7 @@ def pad_to_bounding_box(image, offset_height, offset_width, target_height,
    padded.set_shape(padded_shape)

  if not is_batch:
    padded = array_ops.squeeze(padded, squeeze_dims=[0])
    padded = array_ops.squeeze(padded, axis=[0])

  return padded

@ -732,7 +732,7 @@ def crop_to_bounding_box(image, offset_height, offset_width, target_height,
    cropped.set_shape(cropped_shape)

  if not is_batch:
    cropped = array_ops.squeeze(cropped, squeeze_dims=[0])
    cropped = array_ops.squeeze(cropped, axis=[0])

  return cropped

@ -849,7 +849,7 @@ def resize_image_with_crop_or_pad(image, target_height, target_width):
    resized = control_flow_ops.with_dependencies(assert_ops, resized)

  if not is_batch:
    resized = array_ops.squeeze(resized, squeeze_dims=[0])
    resized = array_ops.squeeze(resized, axis=[0])

  return resized

@ -942,7 +942,7 @@ def resize_images(images,
        for x in [new_width_const, width, new_height_const, height]) and (
            width == new_width_const and height == new_height_const):
      if not is_batch:
        images = array_ops.squeeze(images, squeeze_dims=[0])
        images = array_ops.squeeze(images, axis=[0])
      return images

    if method == ResizeMethod.BILINEAR:
@ -965,7 +965,7 @@ def resize_images(images,
    images.set_shape([None, new_height_const, new_width_const, None])

    if not is_batch:
      images = array_ops.squeeze(images, squeeze_dims=[0])
      images = array_ops.squeeze(images, axis=[0])
    return images

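All of these hunks migrate the deprecated `squeeze_dims=` keyword of
`tf.squeeze` to its replacement `axis=`; the behavior is unchanged. A minimal
sketch of the renamed argument:

```python
import tensorflow as tf

batch_of_one = tf.zeros([1, 4, 4, 3])
image = tf.squeeze(batch_of_one, axis=[0])  # formerly squeeze_dims=[0]

print(image.shape)  # (4, 4, 3)
```
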
Some files were not shown because too many files have changed in this diff.