Merge changes from github.

PiperOrigin-RevId: 186674197
Yifei Feng 2018-02-22 14:24:57 -08:00 committed by TensorFlower Gardener
parent cb7e1963c6
commit dce9a49c19
148 changed files with 7985 additions and 710 deletions

View File

@ -21,7 +21,7 @@ newcomers.
* Other:
* Add `tf.contrib.distributions.Kumaraswamy`.
* `RetryingFileSystem::FlushCaches()` calls the base FileSystem's `FlushCaches()`.
* Add auto_correlation to distributions.
* Add `auto_correlation` to distributions.
* Add `tf.contrib.distributions.Autoregressive`.
* Add SeparableConv1D layer.
* Add convolutional Flipout layers.
@ -31,12 +31,12 @@ newcomers.
* Output variance over tree predictions for classification tasks.
* For `pt` and `eval` commands, allow writing tensor values to filesystem as numpy files.
* gRPC: Propagate truncated errors (instead of returning gRPC internal error).
* Augment parallel_interleave to support 2 kinds of prefetching.
* Augment `parallel_interleave` to support 2 kinds of prefetching.
* Improved XLA support for C64-related ops: log, pow, atan2, tanh.
* Add probabilistic convolutional layers.
## API Changes
* Introducing prepare_variance boolean with default setting to False for backward compatibility.
* Introducing `prepare_variance` boolean with default setting to False for backward compatibility.
* Move `layers_dense_variational_impl.py` to `layers_dense_variational.py`.
## Known Bugs
@ -96,27 +96,6 @@ Yoni Tsafir, yordun, Yuan (Terry) Tang, Yuxin Wu, zhengdi, Zhengsheng Wei, 田
* Starting from 1.6 release, our prebuilt binaries will use AVX instructions.
This may break TF on older CPUs.
## Known Bugs
* Using XLA:GPU with CUDA 9 and CUDA 9.1 results in garbage results and/or
`CUDA_ILLEGAL_ADDRESS` failures.
Google discovered in mid-December 2017 that the PTX-to-SASS compiler in CUDA 9
and CUDA 9.1 sometimes does not properly compute the carry bit when
decomposing 64-bit address calculations with large offsets (e.g. `load [x +
large_constant]`) into 32-bit arithmetic in SASS.
As a result, these versions of `ptxas` miscompile most XLA programs which use
more than 4GB of temp memory. This results in garbage results and/or
`CUDA_ERROR_ILLEGAL_ADDRESS` failures.
A fix in CUDA 9.1.121 is expected in late February 2018. We do not expect a
fix for CUDA 9.0.x. Until the fix is available, the only workaround is to
[downgrade](https://developer.nvidia.com/cuda-toolkit-archive) to CUDA 8.0.x
or disable XLA:GPU.
TensorFlow will print a warning if you use XLA:GPU with a known-bad version of
CUDA; see e00ba24c4038e7644da417ddc639169b6ea59122.
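For context, XLA:GPU is opt-in in TF 1.x, so "disable XLA:GPU" mostly means not turning it on. Below is a minimal sketch (not from this commit) of how the global XLA JIT is typically enabled through the session config; `tf.OptimizerOptions.ON_1` and the config fields shown are the standard TF 1.x names, and leaving that line out keeps XLA:GPU off on an affected CUDA 9.x toolchain.
```python
# Minimal sketch (not from this commit) of how XLA JIT is typically enabled in
# TF 1.x. Omitting the global_jit_level line (the default) keeps XLA:GPU off,
# which is the workaround suggested above for the affected CUDA 9.x toolchains.
import tensorflow as tf

config = tf.ConfigProto()
config.graph_options.optimizer_options.global_jit_level = (
    tf.OptimizerOptions.ON_1)  # opt in to XLA; skip this to stay on the safe path

with tf.Session(config=config) as sess:
    x = tf.random_normal([4, 4])
    print(sess.run(tf.matmul(x, x)))
```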
## Major Features And Improvements
* [Eager execution](https://github.com/tensorflow/tensorflow/tree/r1.5/tensorflow/contrib/eager)
preview version is now available.

View File

@ -445,7 +445,7 @@ def convert_version_to_int(version):
def check_bazel_version(min_version):
"""Check installed bezel version is at least min_version.
"""Check installed bazel version is at least min_version.
Args:
min_version: string for minimum bazel version.
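As a rough illustration of what `check_bazel_version` and `convert_version_to_int` are doing, here is a hypothetical sketch (the function names below are illustrative, not the actual configure.py code) of packing a dotted version string into an integer so versions compare numerically.
```python
# Hypothetical sketch of the idea behind convert_version_to_int() /
# check_bazel_version(): pack each dotted component into three digits so that
# version strings compare numerically. Not the actual configure.py code.
def convert_version_to_int_sketch(version):
    parts = (version.split('-')[0].split('.') + ['0', '0', '0'])[:3]
    return int(''.join(p.zfill(3) for p in parts))  # "0.11.1" -> 11001

def is_at_least(installed, min_version):
    return (convert_version_to_int_sketch(installed) >=
            convert_version_to_int_sketch(min_version))

assert is_at_least('0.11.1', '0.10.0')
assert not is_at_least('0.9.0', '0.10.0')
```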
@ -1078,12 +1078,22 @@ def set_tf_tensorrt_install_path(environ_cp):
break
# Reset and Retry
print('Invalid path to TensorRT. None of the following files can be found:')
print(trt_install_path)
print(os.path.join(trt_install_path, 'lib'))
print(os.path.join(trt_install_path, 'lib64'))
if search_result:
print(libnvinfer_path_from_ldconfig)
if possible_files:
print('TensorRT libraries found in one of the following directories',
'are not compatible with selected cuda and cudnn installations')
print(trt_install_path)
print(os.path.join(trt_install_path, 'lib'))
print(os.path.join(trt_install_path, 'lib64'))
if search_result:
print(libnvinfer_path_from_ldconfig)
else:
print(
'Invalid path to TensorRT. None of the following files can be found:')
print(trt_install_path)
print(os.path.join(trt_install_path, 'lib'))
print(os.path.join(trt_install_path, 'lib64'))
if search_result:
print(libnvinfer_path_from_ldconfig)
else:
raise UserInputError('Invalid TF_TENSORRT setting was provided %d '
@ -1481,7 +1491,6 @@ def main():
'more details.')
config_info_line('mkl', 'Build with MKL support.')
config_info_line('monolithic', 'Config for mostly static monolithic build.')
config_info_line('tensorrt', 'Build with TensorRT support.')
if __name__ == '__main__':
main()

View File

@ -2081,7 +2081,7 @@ TEST_F(CApiAttributesTest, Tensor) {
}
TEST_F(CApiAttributesTest, StringTensor) {
// Create the string-Tensor "atttribute" value.
// Create the string-Tensor "attribute" value.
char encoded[] = {
0, 0, 0, 0, 0, 0, 0, 0, // array[uint64] offsets
1, // varint encoded string length

View File

@ -234,7 +234,8 @@ Status Literal::CopySliceFromInternal(
int64 src_index = linear_index(src_literal.shape(), src_indexes);
int64 dest_index = linear_index(shape(), dest_indexes);
StridedCopy(data<NativeT>(), dest_index, stride_config.dest_stride,
// `this->` is needed to work around MSVC bug #16882
StridedCopy(this->data<NativeT>(), dest_index, stride_config.dest_stride,
src_literal.data<NativeT>(), src_index,
stride_config.source_stride, stride_config.minor_loop_size);
return true;

View File

@ -589,12 +589,9 @@ class HloInstruction {
if (opcode() != other.opcode()) {
return false;
}
auto eq_shapes = layout_sensitive
? [](const Shape& a,
const Shape& b) { return ShapeUtil::Equal(a, b); }
: [](const Shape& a, const Shape& b) {
return ShapeUtil::Compatible(a, b);
};
using EqShapeFuncType = bool (*)(const Shape&, const Shape&);
EqShapeFuncType eq_shapes =
layout_sensitive ? ShapeUtil::Equal : ShapeUtil::Compatible;
if (!eq_shapes(shape(), other.shape())) {
return false;
}

View File

@ -7,6 +7,7 @@ package(default_visibility = ["//tensorflow:__subpackages__"])
load("//third_party/mpi:mpi.bzl", "if_mpi")
load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
load("@local_config_tensorrt//:build_defs.bzl", "if_tensorrt")
py_library(
name = "contrib_py",
@ -107,7 +108,9 @@ py_library(
"//tensorflow/contrib/training:training_py",
"//tensorflow/contrib/util:util_py",
"//tensorflow/python:util",
] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_collectives_py"]),
] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_collectives_py"]) + if_tensorrt([
"//tensorflow/contrib/tensorrt:init_py",
]),
)
cc_library(

View File

@ -81,6 +81,11 @@ For documentation on building a self-contained AAR file with cmake, see
[tensorflow/contrib/android/cmake](cmake).
### Makefile
For documentation on building native TF libraries with make, including a CUDA-enabled variant for devices like the Nvidia Shield TV, see [tensorflow/contrib/makefile/README.md](../makefile/README.md)
## AssetManagerFileSystem
This directory also contains a TensorFlow filesystem supporting the Android

View File

@ -78,7 +78,7 @@ def per_example_maxent_loss(labels, weights, logits, num_classes, eps=1e-15):
# Calculate softmax probabilities for each class.
unnormalized_probs = math_ops.exp(logits)
normalizers = math_ops.reduce_sum(unnormalized_probs, 1, keep_dims=True)
normalizers = math_ops.reduce_sum(unnormalized_probs, 1, keepdims=True)
softmax_predictions = math_ops.divide(unnormalized_probs,
math_ops.add(normalizers, eps))
@ -120,7 +120,7 @@ def per_example_squared_loss(labels, weights, predictions):
update_op: An update operation to update the loss's internal state.
"""
unweighted_loss = math_ops.reduce_sum(
math_ops.square(predictions - labels), 1, keep_dims=True)
math_ops.square(predictions - labels), 1, keepdims=True)
return unweighted_loss * weights, control_flow_ops.no_op()
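The `keep_dims` to `keepdims` change repeated throughout this commit is a keyword rename only; the behaviour is unchanged. A small illustration (not part of the change, assuming only stock `tf.reduce_sum`) of why these reductions keep the reduced axis: the `[batch, 1]` normalizer broadcasts cleanly against the `[batch, num_classes]` tensor.
```python
# Small illustration (not part of this change) of why the reductions above keep
# the reduced axis: the [batch, 1] normalizer broadcasts against [batch, classes].
import tensorflow as tf

logits = tf.constant([[1.0, 2.0, 3.0],
                      [0.5, 0.5, 0.5]])
unnormalized = tf.exp(logits)                                # [2, 3]
normalizers = tf.reduce_sum(unnormalized, 1, keepdims=True)  # [2, 1]
softmax = unnormalized / normalizers                         # broadcast to [2, 3]

with tf.Session() as sess:
    print(sess.run(softmax))  # each row sums to 1
```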

View File

@ -52,6 +52,7 @@ if (NOT WIN32)
# for targets that link ${CMAKE_THREAD_LIBS_INIT}.
find_package (Threads)
# Options for linking CUDA/CUDNN libraries
option(tensorflow_PATH_STATIC_LIB "Additional library search path for libcudnn_static.a, libnccl_static.a, libculibos.a" /usr/local/cuda/lib64/)
option(tensorflow_CUDNN_INCLUDE "cudnn.h header install path" /usr/include/)
if (NOT tensorflow_CUDNN_INCLUDE)
@ -73,6 +74,14 @@ if (NOT WIN32)
# option's default value is OFF. Fill it with real default values
set(tensorflow_CUDA_LIBRARY_PATH /usr/local/cuda/lib64)
endif (NOT tensorflow_CUDA_LIBRARY_PATH)
# Options for linking other libraries
option(systemlib_ZLIB "Use the system installed library as shared objects instead of downloading ZLIB and statically linking to it: ZLIB" OFF)
option(systemlib_ALL "Turn on every possible systemlib_* options" OFF)
if (systemlib_ALL)
set (systemlib_ZLIB ON)
endif (systemlib_ALL)
endif()
if (WIN32)
@ -188,8 +197,10 @@ if (tensorflow_BUILD_CC_TESTS)
include(googletest)
endif()
add_definitions(${ADD_CFLAGS})
link_directories(${ADD_LINK_DIRECTORY})
set(tensorflow_EXTERNAL_LIBRARIES
${zlib_STATIC_LIBRARIES}
${gif_STATIC_LIBRARIES}
${png_STATIC_LIBRARIES}
${jpeg_STATIC_LIBRARIES}
@ -203,6 +214,15 @@ set(tensorflow_EXTERNAL_LIBRARIES
${re2_STATIC_LIBRARIES}
${sqlite_STATIC_LIBRARIES}
)
if (systemlib_ZLIB)
set(tensorflow_EXTERNAL_LIBRARIES ${tensorflow_EXTERNAL_LIBRARIES}
${ZLIB_LIBRARIES})
else (systemlib_ZLIB)
set(tensorflow_EXTERNAL_LIBRARIES ${tensorflow_EXTERNAL_LIBRARIES}
${zlib_STATIC_LIBRARIES})
endif (systemlib_ZLIB)
set(tensorflow_EXTERNAL_DEPENDENCIES
zlib_copy_headers_to_destination
gif_copy_headers_to_destination

View File

@ -12,61 +12,75 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
include (ExternalProject)
if (systemlib_ZLIB)
find_package(PkgConfig)
pkg_search_module(ZLIB REQUIRED zlib)
set(zlib_INCLUDE_DIR ${ZLIB_INCLUDE_DIRS})
set(ADD_LINK_DIRECTORY ${ADD_LINK_DIRECTORY} ${ZLIB_LIBRARY_DIRS})
set(ADD_CFLAGS ${ADD_CFLAGS} ${ZLIB_CFLAGS_OTHER})
set(zlib_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/zlib_archive)
set(ZLIB_URL https://github.com/madler/zlib)
set(ZLIB_BUILD ${CMAKE_CURRENT_BINARY_DIR}/zlib/src/zlib)
set(ZLIB_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/zlib/install)
set(ZLIB_TAG 50893291621658f355bc5b4d450a8d06a563053d)
# To meet DEPENDS zlib from other projects.
# If we hit this line, zlib is already built and installed to the system.
add_custom_target(zlib)
add_custom_target(zlib_copy_headers_to_destination)
if(WIN32)
if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
set(zlib_STATIC_LIBRARIES
debug ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstaticd.lib
optimized ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstatic.lib)
else()
if(CMAKE_BUILD_TYPE EQUAL Debug)
else (systemlib_ZLIB)
include (ExternalProject)
set(zlib_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/zlib_archive)
set(ZLIB_URL https://github.com/madler/zlib)
set(ZLIB_BUILD ${CMAKE_CURRENT_BINARY_DIR}/zlib/src/zlib)
set(ZLIB_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/zlib/install)
set(ZLIB_TAG 50893291621658f355bc5b4d450a8d06a563053d)
if(WIN32)
if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
set(zlib_STATIC_LIBRARIES
${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstaticd.lib)
debug ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstaticd.lib
optimized ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstatic.lib)
else()
set(zlib_STATIC_LIBRARIES
${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstatic.lib)
if(CMAKE_BUILD_TYPE EQUAL Debug)
set(zlib_STATIC_LIBRARIES
${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstaticd.lib)
else()
set(zlib_STATIC_LIBRARIES
${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstatic.lib)
endif()
endif()
else()
set(zlib_STATIC_LIBRARIES
${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/libz.a)
endif()
else()
set(zlib_STATIC_LIBRARIES
${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/libz.a)
endif()
set(ZLIB_HEADERS
"${ZLIB_INSTALL}/include/zconf.h"
"${ZLIB_INSTALL}/include/zlib.h"
)
set(ZLIB_HEADERS
"${ZLIB_INSTALL}/include/zconf.h"
"${ZLIB_INSTALL}/include/zlib.h"
)
ExternalProject_Add(zlib
PREFIX zlib
GIT_REPOSITORY ${ZLIB_URL}
GIT_TAG ${ZLIB_TAG}
INSTALL_DIR ${ZLIB_INSTALL}
BUILD_IN_SOURCE 1
BUILD_BYPRODUCTS ${zlib_STATIC_LIBRARIES}
DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
CMAKE_CACHE_ARGS
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE}
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_INSTALL_PREFIX:STRING=${ZLIB_INSTALL}
)
ExternalProject_Add(zlib
PREFIX zlib
GIT_REPOSITORY ${ZLIB_URL}
GIT_TAG ${ZLIB_TAG}
INSTALL_DIR ${ZLIB_INSTALL}
BUILD_IN_SOURCE 1
BUILD_BYPRODUCTS ${zlib_STATIC_LIBRARIES}
DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
CMAKE_CACHE_ARGS
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE}
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_INSTALL_PREFIX:STRING=${ZLIB_INSTALL}
)
# put zlib includes in the directory where they are expected
add_custom_target(zlib_create_destination_dir
COMMAND ${CMAKE_COMMAND} -E make_directory ${zlib_INCLUDE_DIR}
DEPENDS zlib)
# put zlib includes in the directory where they are expected
add_custom_target(zlib_create_destination_dir
COMMAND ${CMAKE_COMMAND} -E make_directory ${zlib_INCLUDE_DIR}
DEPENDS zlib)
add_custom_target(zlib_copy_headers_to_destination
DEPENDS zlib_create_destination_dir)
add_custom_target(zlib_copy_headers_to_destination
DEPENDS zlib_create_destination_dir)
foreach(header_file ${ZLIB_HEADERS})
add_custom_command(TARGET zlib_copy_headers_to_destination PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${header_file} ${zlib_INCLUDE_DIR})
endforeach()
foreach(header_file ${ZLIB_HEADERS})
add_custom_command(TARGET zlib_copy_headers_to_destination PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${header_file} ${zlib_INCLUDE_DIR})
endforeach()
endif (systemlib_ZLIB)

View File

@ -413,6 +413,9 @@ tensorflow/contrib/tensorboard
tensorflow/contrib/tensorboard/plugins
tensorflow/contrib/tensorboard/plugins/projector
tensorflow/contrib/tensorboard/plugins/trace
# TODO(sami): Add cmake implementations.
# tensorflow/contrib/tensorrt/python
# tensorflow/contrib/tensorrt/python/ops
tensorflow/contrib/tensor_forest
tensorflow/contrib/tensor_forest/client
tensorflow/contrib/tensor_forest/hybrid

View File

@ -105,8 +105,8 @@ def crf_sequence_score(inputs, tag_indices, sequence_lengths,
return utils.smart_cond(
pred=math_ops.equal(inputs.shape[1].value or array_ops.shape(inputs)[1],
1),
fn1=_single_seq_fn,
fn2=_multi_seq_fn)
true_fn=_single_seq_fn,
false_fn=_multi_seq_fn)
def crf_log_norm(inputs, sequence_lengths, transition_params):
@ -511,7 +511,7 @@ def crf_decode(potentials, transition_params, sequence_length):
return decode_tags, best_score
return utils.smart_cond(
pred=math_ops.equal(
potentials.shape[1].value or array_ops.shape(potentials)[1], 1),
fn1=_single_seq_fn,
fn2=_multi_seq_fn)
pred=math_ops.equal(potentials.shape[1].value or
array_ops.shape(potentials)[1], 1),
true_fn=_single_seq_fn,
false_fn=_multi_seq_fn)
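The rename from `fn1`/`fn2` to `true_fn`/`false_fn` aligns `smart_cond` with `tf.cond`'s keyword names. A hedged sketch of the call shape follows (this commit also re-exports `smart_cond` from `tf.contrib.framework`); when the predicate is statically known, `smart_cond` calls the chosen function directly instead of building a `tf.cond`.
```python
# Hedged sketch of the renamed keyword arguments. smart_cond takes callables,
# mirrors tf.cond's true_fn/false_fn names, and short-circuits to a plain
# Python call when the predicate is statically known.
import tensorflow as tf
from tensorflow.contrib.framework import smart_cond  # re-exported by this commit

x = tf.placeholder(tf.float32, shape=[None, 3])

result = smart_cond(
    pred=tf.equal(tf.shape(x)[0], 1),  # dynamic predicate -> builds a tf.cond
    true_fn=lambda: x * 2.0,
    false_fn=lambda: x - 1.0)
```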

View File

@ -278,7 +278,7 @@ class ExpRelaxedOneHotCategorical(distribution.Distribution):
* math_ops.log(self.temperature))
# compute the unnormalized density
log_softmax = nn_ops.log_softmax(logits_2d - x_2d * self._temperature_2d)
log_unnorm_prob = math_ops.reduce_sum(log_softmax, [-1], keep_dims=False)
log_unnorm_prob = math_ops.reduce_sum(log_softmax, [-1], keepdims=False)
# combine unnormalized density with normalization constant
log_prob = log_norm_const + log_unnorm_prob
# Reshapes log_prob to be consistent with shape of user-supplied logits

View File

@ -22,11 +22,10 @@ to models defined without using eager execution.
Eager execution is included in TensorFlow versions 1.5 and above.
Installation instructions at https://www.tensorflow.org/install/
The contents of this guide are compatible with TensorFlow 1.5.
However, if you run into bugs that are fixed in source but not the
release, you may want to either either [building from
source](https://www.tensorflow.org/install/install_sources)
or the try latest nightly builds. The nightly builds are available as:
The contents of this guide are compatible with TensorFlow 1.5. However, if you
run into bugs that are fixed in source but not the release, you may want to
either [build from source](https://www.tensorflow.org/install/install_sources)
or try a nightly build. The nightly builds are available as:
- [`pip` packages](https://github.com/tensorflow/tensorflow/blob/master/README.md#installation) and

View File

@ -192,11 +192,11 @@ class KMeans(object):
# Computes Euclidean distance. Note the first and third terms are
# broadcast additions.
squared_distance = (
math_ops.reduce_sum(math_ops.square(inp), 1, keep_dims=True) -
math_ops.reduce_sum(math_ops.square(inp), 1, keepdims=True) -
2 * math_ops.matmul(inp, clusters, transpose_b=True) +
array_ops.transpose(
math_ops.reduce_sum(
math_ops.square(clusters), 1, keep_dims=True)))
math_ops.square(clusters), 1, keepdims=True)))
output.append(squared_distance)
return output
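The comment above relies on the identity ||x − c||² = ||x||² − 2·x·c + ||c||², with the two squared-norm terms broadcast across rows and columns. A quick numpy check of that expansion (illustrative only, not part of the change):
```python
# Quick numpy check (illustrative only) of the expansion used above:
# ||x - c||^2 = ||x||^2 - 2 x.c + ||c||^2, with the squared norms broadcast
# across rows/columns so no explicit pairwise loop is needed.
import numpy as np

inp = np.random.randn(5, 3)       # 5 points
clusters = np.random.randn(4, 3)  # 4 cluster centers

expanded = (np.sum(inp ** 2, axis=1, keepdims=True)            # [5, 1]
            - 2.0 * inp.dot(clusters.T)                        # [5, 4]
            + np.sum(clusters ** 2, axis=1, keepdims=True).T)  # [1, 4]

brute_force = np.array([[np.sum((p - c) ** 2) for c in clusters] for p in inp])
assert np.allclose(expanded, brute_force)
```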

View File

@ -85,6 +85,8 @@ See the @{$python/contrib.framework} guide.
@@py_func
@@sort
@@get_placeholders
@@CriticalSection
@@BoundedTensorSpec
@ -102,10 +104,10 @@ from tensorflow.contrib.framework.python.ops import *
from tensorflow.python.framework.ops import prepend_name_scope
from tensorflow.python.framework.ops import strip_name_scope
from tensorflow.python.framework.tensor_spec import BoundedTensorSpec
from tensorflow.python.framework.tensor_spec import TensorSpec
from tensorflow.python.ops.control_flow_ops import smart_cond
from tensorflow.python.ops.control_flow_ops import smart_constant_value
from tensorflow.python.util.all_util import remove_undocumented
_allowed_symbols = ['nest']

View File

@ -133,6 +133,18 @@ def fuse_op(graph_def, input_nodes, output_nodes, output_dtypes,
def get_placeholders(graph):
"""Get placeholders of a graph.
For example:
```python
a = tf.placeholder(dtype=tf.float32, shape=[2, 2], name='a')
a = tf.placeholder(dtype=tf.int32, shape=[3, 2], name='b')
tf.contrib.framework.get_placeholders(tf.get_default_graph())
# Returns:
# [<tf.Tensor 'a:0' shape=(2, 2) dtype=float32>,
# <tf.Tensor 'b:0' shape=(3, 2) dtype=int32>]
```
Args:
graph: A tf.Graph.
Returns:

View File

@ -85,7 +85,7 @@ class BipartiteMatchOp : public OpKernel {
context->allocate_output(1, TensorShape({num_input_columns}),
&column_to_row_match_indices));
typename TTypes<float, 2>::ConstTensor distance_mat =
TTypes<float, 2>::ConstTensor distance_mat =
input_distance_mat.shaped<float, 2>(
{num_input_rows, num_input_columns});

View File

@ -517,8 +517,8 @@ def batch_norm(inputs,
then the batch normalization uses weighted mean and
variance. (This can be used to correct for bias in training
example selection.)
fused: if `True`, use a faster, fused implementation if possible.
If `None`, use the system recommended implementation.
fused: if `None` or `True`, use a faster, fused implementation if possible.
If `False`, use the system recommended implementation.
data_format: A string. `NHWC` (default) and `NCHW` are supported.
zero_debias_moving_mean: Use zero_debias for moving_mean. It creates a new
pair of variables 'moving_mean/biased' and 'moving_mean/local_step'.
@ -778,7 +778,7 @@ def batch_norm(inputs,
else:
if data_format == DATA_FORMAT_NCHW:
mean, variance = nn.weighted_moments(
inputs, moments_axes, batch_weights, keep_dims=True)
inputs, moments_axes, batch_weights, keepdims=True)
mean = array_ops.reshape(mean, [-1])
variance = array_ops.reshape(variance, [-1])
else:
@ -2836,9 +2836,9 @@ def spatial_softmax(features,
softmax_attention = nn.softmax(features / temperature)
expected_x = math_ops.reduce_sum(
pos_x * softmax_attention, [1], keep_dims=True)
pos_x * softmax_attention, [1], keepdims=True)
expected_y = math_ops.reduce_sum(
pos_y * softmax_attention, [1], keep_dims=True)
pos_y * softmax_attention, [1], keepdims=True)
expected_xy = array_ops.concat([expected_x, expected_y], 1)
feature_keypoints = array_ops.reshape(expected_xy,
[-1, num_channels.value * 2])
@ -3018,7 +3018,7 @@ def poincare_normalize(x, axis=1, epsilon=1e-5, name=None):
"""
with ops.name_scope(name, 'poincare_normalize', [x]) as name:
x = ops.convert_to_tensor(x, name='x')
square_sum = math_ops.reduce_sum(math_ops.square(x), axis, keep_dims=True)
square_sum = math_ops.reduce_sum(math_ops.square(x), axis, keepdims=True)
x_inv_norm = math_ops.rsqrt(square_sum)
x_inv_norm = math_ops.minimum((1. - epsilon) * x_inv_norm, 1.)
return math_ops.multiply(x, x_inv_norm, name=name)
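As a plain restatement of `poincare_normalize` above (illustrative numpy only, not the TF implementation): vectors are rescaled so their norm along the chosen axis is at most `1 - epsilon`, i.e. projected into the open unit ball used by Poincaré embeddings, while vectors already inside the ball are left untouched.
```python
# Illustrative numpy restatement (not the TF implementation): rescale vectors so
# their norm along `axis` is at most (1 - epsilon), i.e. project them into the
# open unit ball used by Poincare embeddings; shorter vectors are left as-is.
import numpy as np

def poincare_normalize_np(x, axis=1, epsilon=1e-5):
    square_sum = np.sum(np.square(x), axis=axis, keepdims=True)
    x_inv_norm = 1.0 / np.sqrt(square_sum)
    x_inv_norm = np.minimum((1.0 - epsilon) * x_inv_norm, 1.0)
    return x * x_inv_norm

v = np.array([[3.0, 4.0], [0.1, 0.2]])
out = poincare_normalize_np(v)
# Row 0 (norm 5.0) is scaled down to norm ~0.99999; row 1 (norm ~0.22) is unchanged.
```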

View File

@ -6,7 +6,7 @@ TensorFlow Lite uses many techniques for achieving low latency like optimizing t
![image](g3doc/TFLite-Architecture.jpg)
# Getting Started with an Android Demo App
This section contains an example application using TensorFlow Lite for Android devices. The demo is a sample camera app that classifies images continuously using a quantized Mobilenet model. A device running Android 5.0 ( API 21) or higher is required to run the demo.
This section contains an example application using TensorFlow Lite for Android devices. The demo is a sample camera app that classifies images continuously using either a quantized Mobilenet model or a floating point Inception-v3 model. A device running Android 5.0 ( API 21) or higher is required to run the demo.
There are 3 ways to get the demo app to your device
- Download the prebuilt binary or
@ -29,9 +29,16 @@ The simplest way to compile the demo app, and try out changes to the project cod
- Make sure the Android SDK version is greater than 26 and NDK version is greater than 14 (in the Android Studio Settings).
- Import the `tensorflow/contrib/lite/java/demo` directory as a new Android Studio project.
- Click through installing all the Gradle extensions it requests.
- Download the quantized Mobilenet TensorFlow Lite model from [here](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip)
- unzip and copy mobilenet_quant_v1_224.tflite to the assets directory:
`tensorflow/contrib/lite/java/demo/app/src/main/assets/`
- Either
- Download the quantized Mobilenet TensorFlow Lite model from [here](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip)
- unzip and copy mobilenet_quant_v1_224.tflite to the assets directory:
`tensorflow/contrib/lite/java/demo/app/src/main/assets/`
- Or download the floating point Inception-v3 model from [here](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_slim_2016_android_2017_11_10.zip)
- unzip and copy inceptionv3_non_slim_2015.tflite to the assets directory
- change the chosen classifier in [Camera2BasicFragment.java](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java) from
`classifier = new ImageClassifierQuantizedMobileNet(getActivity());`
to
`classifier = new ImageClassifierFloatInception(getActivity());`
- Build and run the demo app
## Building TensorFlow Lite and the demo app from source
@ -84,7 +91,7 @@ Currently, we only support building the Android demo app within a Python 2
environment (due to a Bazel bug).
### More about the demo
The demo is resizing each camera image frame to (224 width * 224 height) to match the quantized Mobilenet model being used. The resized image is converted into a ByteBuffer row by row of size 1 * 224 * 224 * 3 bytes, where 1 is the number of images in a batch 224 * 224 is the width and height of the image 3 bytes represents three colors of a pixel. This demo uses the TensorFlow Lite Java inference API for models which take a single input and provide a single output. This outputs a two-dimensional array, with the first dimension being the category index and the second dimension being the confidence of classification. The Mobilenet model has 1001 unique categories and the app sorts the probabilities of all the categories and displays the top three. The Mobilenet quantized model is bundled within the assets directory of the app.
The demo is resizing each camera image frame to (224 width * 224 height) to match the quantized Mobilenet model being used (299 * 299 for Inception-v3). The resized image is converted into a ByteBuffer row by row of size 1 * 224 * 224 * 3 bytes, where 1 is the number of images in a batch. 224 * 224 (299 * 299) is the width and height of the image. 3 bytes represent three colors of a pixel. This demo uses the TensorFlow Lite Java inference API for models which take a single input and provide a single output. This outputs a two-dimensional array, with the first dimension being the category index and the second dimension being the confidence of classification. Both models have 1001 unique categories and the app sorts the probabilities of all the categories and displays the top three. The model file must be downloaded and bundled within the assets directory of the app.
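A rough Python sketch of the sizes described above (the demo itself is Java, and the probability array here is a stand-in, not real model output): one batch of one 224x224 RGB image at one byte per channel for the quantized model, plus a top-3 selection over the 1001-category output.
```python
# Rough Python sketch (the demo itself is Java) of the sizes described above:
# one batch of one 224x224 RGB image at one byte per channel for the quantized
# model, plus a top-3 selection over the 1001-category output.
import numpy as np

batch, height, width, channels, bytes_per_channel = 1, 224, 224, 3, 1
buffer_size = batch * height * width * channels * bytes_per_channel
assert buffer_size == 150528  # bytes written into the input ByteBuffer

probs = np.random.rand(1001)
probs /= probs.sum()                  # stand-in for the model's output row
top3 = np.argsort(probs)[-3:][::-1]   # indices of the three most likely labels
```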
# iOS Demo App

File diff suppressed because it is too large

View File

@ -296,7 +296,8 @@ public class Camera2BasicFragment extends Fragment
public void onActivityCreated(Bundle savedInstanceState) {
super.onActivityCreated(savedInstanceState);
try {
classifier = new ImageClassifier(getActivity());
// create either a new ImageClassifierQuantizedMobileNet or an ImageClassifierFloatInception
classifier = new ImageClassifierQuantizedMobileNet(getActivity());
} catch (IOException e) {
Log.e(TAG, "Failed to initialize an image classifier.");
}
@ -658,8 +659,7 @@ public class Camera2BasicFragment extends Fragment
showToast("Uninitialized Classifier or invalid context.");
return;
}
Bitmap bitmap =
textureView.getBitmap(ImageClassifier.DIM_IMG_SIZE_X, ImageClassifier.DIM_IMG_SIZE_Y);
Bitmap bitmap = textureView.getBitmap(classifier.getImageSizeX(), classifier.getImageSizeY());
String textToShow = classifier.classifyFrame(bitmap);
bitmap.recycle();
showToast(textToShow);

View File

@ -37,17 +37,11 @@ import java.util.PriorityQueue;
import org.tensorflow.lite.Interpreter;
/** Classifies images with Tensorflow Lite. */
public class ImageClassifier {
public abstract class ImageClassifier {
/** Tag for the {@link Log}. */
private static final String TAG = "TfLiteCameraDemo";
/** Name of the model file stored in Assets. */
private static final String MODEL_PATH = "mobilenet_quant_v1_224.tflite";
/** Name of the label file stored in Assets. */
private static final String LABEL_PATH = "labels.txt";
/** Number of results to show in the UI. */
private static final int RESULTS_TO_SHOW = 3;
@ -56,23 +50,18 @@ public class ImageClassifier {
private static final int DIM_PIXEL_SIZE = 3;
static final int DIM_IMG_SIZE_X = 224;
static final int DIM_IMG_SIZE_Y = 224;
/* Preallocated buffers for storing image data in. */
private int[] intValues = new int[DIM_IMG_SIZE_X * DIM_IMG_SIZE_Y];
private int[] intValues = new int[getImageSizeX() * getImageSizeY()];
/** An instance of the driver class to run model inference with Tensorflow Lite. */
private Interpreter tflite;
protected Interpreter tflite;
/** Labels corresponding to the output of the vision model. */
private List<String> labelList;
/** A ByteBuffer to hold image data, to be fed into Tensorflow Lite as inputs. */
private ByteBuffer imgData = null;
protected ByteBuffer imgData = null;
/** An array to hold inference results, to be fed into Tensorflow Lite as outputs. */
private byte[][] labelProbArray = null;
/** multi-stage low pass filter * */
private float[][] filterLabelProbArray = null;
@ -95,10 +84,13 @@ public class ImageClassifier {
labelList = loadLabelList(activity);
imgData =
ByteBuffer.allocateDirect(
DIM_BATCH_SIZE * DIM_IMG_SIZE_X * DIM_IMG_SIZE_Y * DIM_PIXEL_SIZE);
DIM_BATCH_SIZE
* getImageSizeX()
* getImageSizeY()
* DIM_PIXEL_SIZE
* getNumBytesPerChannel());
imgData.order(ByteOrder.nativeOrder());
labelProbArray = new byte[1][labelList.size()];
filterLabelProbArray = new float[FILTER_STAGES][labelList.size()];
filterLabelProbArray = new float[FILTER_STAGES][getNumLabels()];
Log.d(TAG, "Created a Tensorflow Lite Image Classifier.");
}
@ -111,7 +103,7 @@ public class ImageClassifier {
convertBitmapToByteBuffer(bitmap);
// Here's where the magic happens!!!
long startTime = SystemClock.uptimeMillis();
tflite.run(imgData, labelProbArray);
runInference();
long endTime = SystemClock.uptimeMillis();
Log.d(TAG, "Timecost to run model inference: " + Long.toString(endTime - startTime));
@ -125,12 +117,12 @@ public class ImageClassifier {
}
void applyFilter() {
int numLabels = labelList.size();
int numLabels = getNumLabels();
// Low pass filter `labelProbArray` into the first stage of the filter.
for (int j = 0; j < numLabels; ++j) {
filterLabelProbArray[0][j] +=
FILTER_FACTOR * (labelProbArray[0][j] - filterLabelProbArray[0][j]);
FILTER_FACTOR * (getProbability(j) - filterLabelProbArray[0][j]);
}
// Low pass filter each stage into the next.
for (int i = 1; i < FILTER_STAGES; ++i) {
@ -142,7 +134,7 @@ public class ImageClassifier {
// Copy the last stage filter output back to `labelProbArray`.
for (int j = 0; j < numLabels; ++j) {
labelProbArray[0][j] = (byte)filterLabelProbArray[FILTER_STAGES - 1][j];
setProbability(j, filterLabelProbArray[FILTER_STAGES - 1][j]);
}
}
@ -156,7 +148,7 @@ public class ImageClassifier {
private List<String> loadLabelList(Activity activity) throws IOException {
List<String> labelList = new ArrayList<String>();
BufferedReader reader =
new BufferedReader(new InputStreamReader(activity.getAssets().open(LABEL_PATH)));
new BufferedReader(new InputStreamReader(activity.getAssets().open(getLabelPath())));
String line;
while ((line = reader.readLine()) != null) {
labelList.add(line);
@ -167,7 +159,7 @@ public class ImageClassifier {
/** Memory-map the model file in Assets. */
private MappedByteBuffer loadModelFile(Activity activity) throws IOException {
AssetFileDescriptor fileDescriptor = activity.getAssets().openFd(MODEL_PATH);
AssetFileDescriptor fileDescriptor = activity.getAssets().openFd(getModelPath());
FileInputStream inputStream = new FileInputStream(fileDescriptor.getFileDescriptor());
FileChannel fileChannel = inputStream.getChannel();
long startOffset = fileDescriptor.getStartOffset();
@ -185,12 +177,10 @@ public class ImageClassifier {
// Convert the image to floating point.
int pixel = 0;
long startTime = SystemClock.uptimeMillis();
for (int i = 0; i < DIM_IMG_SIZE_X; ++i) {
for (int j = 0; j < DIM_IMG_SIZE_Y; ++j) {
for (int i = 0; i < getImageSizeX(); ++i) {
for (int j = 0; j < getImageSizeY(); ++j) {
final int val = intValues[pixel++];
imgData.put((byte) ((val >> 16) & 0xFF));
imgData.put((byte) ((val >> 8) & 0xFF));
imgData.put((byte) (val & 0xFF));
addPixelValue(val);
}
}
long endTime = SystemClock.uptimeMillis();
@ -199,9 +189,9 @@ public class ImageClassifier {
/** Prints top-K labels, to be shown in UI as the results. */
private String printTopKLabels() {
for (int i = 0; i < labelList.size(); ++i) {
for (int i = 0; i < getNumLabels(); ++i) {
sortedLabels.add(
new AbstractMap.SimpleEntry<>(labelList.get(i), (labelProbArray[0][i] & 0xff) / 255.0f));
new AbstractMap.SimpleEntry<>(labelList.get(i), getNormalizedProbability(i)));
if (sortedLabels.size() > RESULTS_TO_SHOW) {
sortedLabels.poll();
}
@ -214,4 +204,89 @@ public class ImageClassifier {
}
return textToShow;
}
/**
* Get the name of the model file stored in Assets.
*
* @return
*/
protected abstract String getModelPath();
/**
* Get the name of the label file stored in Assets.
*
* @return
*/
protected abstract String getLabelPath();
/**
* Get the image size along the x axis.
*
* @return
*/
protected abstract int getImageSizeX();
/**
* Get the image size along the y axis.
*
* @return
*/
protected abstract int getImageSizeY();
/**
* Get the number of bytes that is used to store a single color channel value.
*
* @return
*/
protected abstract int getNumBytesPerChannel();
/**
* Add pixelValue to byteBuffer.
*
* @param pixelValue
*/
protected abstract void addPixelValue(int pixelValue);
/**
* Read the probability value for the specified label. This is either the original value as it was
* read from the net's output or the updated value after the filter was applied.
*
* @param labelIndex
* @return
*/
protected abstract float getProbability(int labelIndex);
/**
* Set the probability value for the specified label.
*
* @param labelIndex
* @param value
*/
protected abstract void setProbability(int labelIndex, Number value);
/**
* Get the normalized probability value for the specified label. This is the final value as it
* will be shown to the user.
*
* @return
*/
protected abstract float getNormalizedProbability(int labelIndex);
/**
* Run inference using the prepared input in {@link #imgData}. Afterwards, the result will be
* provided by getProbability().
*
* <p>This additional method is necessary, because we don't have a common base for different
* primitive data types.
*/
protected abstract void runInference();
/**
* Get the total number of labels.
*
* @return
*/
protected int getNumLabels() {
return labelList.size();
}
}

View File

@ -0,0 +1,103 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
package com.example.android.tflitecamerademo;
import android.app.Activity;
import java.io.IOException;
/**
* This classifier works with the Inception-v3 slim model. It applies floating point inference
* rather than using a quantized model.
*/
public class ImageClassifierFloatInception extends ImageClassifier {
/** The inception net requires additional normalization of the used input. */
private static final int IMAGE_MEAN = 128;
private static final float IMAGE_STD = 128.0f;
/**
* An array to hold inference results, to be fed into Tensorflow Lite as outputs. This isn't part
* of the super class, because we need a primitive array here.
*/
private float[][] labelProbArray = null;
/**
* Initializes an {@code ImageClassifier}.
*
* @param activity
*/
ImageClassifierFloatInception(Activity activity) throws IOException {
super(activity);
labelProbArray = new float[1][getNumLabels()];
}
@Override
protected String getModelPath() {
// you can download this file from
// https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_slim_2016_android_2017_11_10.zip
return "inceptionv3_slim_2016.tflite";
}
@Override
protected String getLabelPath() {
return "labels_imagenet_slim.txt";
}
@Override
protected int getImageSizeX() {
return 299;
}
@Override
protected int getImageSizeY() {
return 299;
}
@Override
protected int getNumBytesPerChannel() {
// a 32bit float value requires 4 bytes
return 4;
}
@Override
protected void addPixelValue(int pixelValue) {
imgData.putFloat((((pixelValue >> 16) & 0xFF) - IMAGE_MEAN) / IMAGE_STD);
imgData.putFloat((((pixelValue >> 8) & 0xFF) - IMAGE_MEAN) / IMAGE_STD);
imgData.putFloat(((pixelValue & 0xFF) - IMAGE_MEAN) / IMAGE_STD);
}
@Override
protected float getProbability(int labelIndex) {
return labelProbArray[0][labelIndex];
}
@Override
protected void setProbability(int labelIndex, Number value) {
labelProbArray[0][labelIndex] = value.floatValue();
}
@Override
protected float getNormalizedProbability(int labelIndex) {
// TODO the following value isn't in [0,1] yet, but may be greater. Why?
return getProbability(labelIndex);
}
@Override
protected void runInference() {
tflite.run(imgData, labelProbArray);
}
}

View File

@ -0,0 +1,94 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
package com.example.android.tflitecamerademo;
import android.app.Activity;
import java.io.IOException;
/** This classifier works with the quantized MobileNet model. */
public class ImageClassifierQuantizedMobileNet extends ImageClassifier {
/**
* An array to hold inference results, to be fed into Tensorflow Lite as outputs. This isn't part
* of the super class, because we need a primitive array here.
*/
private byte[][] labelProbArray = null;
/**
* Initializes an {@code ImageClassifier}.
*
* @param activity
*/
ImageClassifierQuantizedMobileNet(Activity activity) throws IOException {
super(activity);
labelProbArray = new byte[1][getNumLabels()];
}
@Override
protected String getModelPath() {
// you can download this file from
// https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip
return "mobilenet_quant_v1_224.tflite";
}
@Override
protected String getLabelPath() {
return "labels_mobilenet_quant_v1_224.txt";
}
@Override
protected int getImageSizeX() {
return 224;
}
@Override
protected int getImageSizeY() {
return 224;
}
@Override
protected int getNumBytesPerChannel() {
// the quantized model uses a single byte only
return 1;
}
@Override
protected void addPixelValue(int pixelValue) {
imgData.put((byte) ((pixelValue >> 16) & 0xFF));
imgData.put((byte) ((pixelValue >> 8) & 0xFF));
imgData.put((byte) (pixelValue & 0xFF));
}
@Override
protected float getProbability(int labelIndex) {
return labelProbArray[0][labelIndex];
}
@Override
protected void setProbability(int labelIndex, Number value) {
labelProbArray[0][labelIndex] = value.byteValue();
}
@Override
protected float getNormalizedProbability(int labelIndex) {
return (labelProbArray[0][labelIndex] & 0xff) / 255.0f;
}
@Override
protected void runInference() {
tflite.run(imgData, labelProbArray);
}
}

View File

@ -50,16 +50,12 @@ def pairwise_distance(feature, squared=False):
pairwise_distances: 2-D Tensor of size [number of data, number of data].
"""
pairwise_distances_squared = math_ops.add(
math_ops.reduce_sum(math_ops.square(feature), axis=[1], keepdims=True),
math_ops.reduce_sum(
math_ops.square(feature),
axis=[1],
keep_dims=True),
math_ops.reduce_sum(
math_ops.square(
array_ops.transpose(feature)),
math_ops.square(array_ops.transpose(feature)),
axis=[0],
keep_dims=True)) - 2.0 * math_ops.matmul(
feature, array_ops.transpose(feature))
keepdims=True)) - 2.0 * math_ops.matmul(feature,
array_ops.transpose(feature))
# Deal with numerical inaccuracies. Set small negatives to zero.
pairwise_distances_squared = math_ops.maximum(pairwise_distances_squared, 0.0)
@ -132,10 +128,10 @@ def masked_maximum(data, mask, dim=1):
masked_maximums: N-D `Tensor`.
The maximized dimension is of size 1 after the operation.
"""
axis_minimums = math_ops.reduce_min(data, dim, keep_dims=True)
axis_minimums = math_ops.reduce_min(data, dim, keepdims=True)
masked_maximums = math_ops.reduce_max(
math_ops.multiply(
data - axis_minimums, mask), dim, keep_dims=True) + axis_minimums
math_ops.multiply(data - axis_minimums, mask), dim,
keepdims=True) + axis_minimums
return masked_maximums
@ -151,10 +147,10 @@ def masked_minimum(data, mask, dim=1):
masked_minimums: N-D `Tensor`.
The minimized dimension is of size 1 after the operation.
"""
axis_maximums = math_ops.reduce_max(data, dim, keep_dims=True)
axis_maximums = math_ops.reduce_max(data, dim, keepdims=True)
masked_minimums = math_ops.reduce_min(
math_ops.multiply(
data - axis_maximums, mask), dim, keep_dims=True) + axis_maximums
math_ops.multiply(data - axis_maximums, mask), dim,
keepdims=True) + axis_maximums
return masked_minimums
@ -202,8 +198,7 @@ def triplet_semihard_loss(labels, embeddings, margin=1.0):
mask_final = array_ops.reshape(
math_ops.greater(
math_ops.reduce_sum(
math_ops.cast(
mask, dtype=dtypes.float32), 1, keep_dims=True),
math_ops.cast(mask, dtype=dtypes.float32), 1, keepdims=True),
0.0), [batch_size, batch_size])
mask_final = array_ops.transpose(mask_final)
@ -290,7 +285,7 @@ def npairs_loss(labels, embeddings_anchor, embeddings_positive,
labels_remapped = math_ops.to_float(
math_ops.equal(labels, array_ops.transpose(labels)))
labels_remapped /= math_ops.reduce_sum(labels_remapped, 1, keep_dims=True)
labels_remapped /= math_ops.reduce_sum(labels_remapped, 1, keepdims=True)
# Add the softmax loss.
xent_loss = nn.softmax_cross_entropy_with_logits(
@ -395,7 +390,7 @@ def npairs_loss_multilabel(sparse_labels, embeddings_anchor,
multilabel_adjacency_matrix = _build_multilabel_adjacency(sparse_labels)
labels_remapped = math_ops.to_float(multilabel_adjacency_matrix)
labels_remapped /= math_ops.reduce_sum(labels_remapped, 1, keep_dims=True)
labels_remapped /= math_ops.reduce_sum(labels_remapped, 1, keepdims=True)
# Add the softmax loss.
xent_loss = nn.softmax_cross_entropy_with_logits(
@ -448,10 +443,10 @@ def lifted_struct_loss(labels, embeddings, margin=1.0):
# Safe maximum: Temporarily shift negative distances
# above zero before taking max.
# this is to take the max only among negatives.
row_minimums = math_ops.reduce_min(diff, 1, keep_dims=True)
row_minimums = math_ops.reduce_min(diff, 1, keepdims=True)
row_negative_maximums = math_ops.reduce_max(
math_ops.multiply(
diff - row_minimums, mask), 1, keep_dims=True) + row_minimums
math_ops.multiply(diff - row_minimums, mask), 1,
keepdims=True) + row_minimums
# Compute the loss.
# Keep track of matrix of maximums where M_ij = max(m_i, m_j)
@ -467,10 +462,11 @@ def lifted_struct_loss(labels, embeddings, margin=1.0):
array_ops.transpose(max_elements), [-1, 1])
loss_exp_left = array_ops.reshape(
math_ops.reduce_sum(math_ops.multiply(
math_ops.exp(
diff_tiled - max_elements_vect),
mask_tiled), 1, keep_dims=True), [batch_size, batch_size])
math_ops.reduce_sum(
math_ops.multiply(
math_ops.exp(diff_tiled - max_elements_vect), mask_tiled),
1,
keepdims=True), [batch_size, batch_size])
loss_mat = max_elements + math_ops.log(
loss_exp_left + array_ops.transpose(loss_exp_left))
@ -686,7 +682,7 @@ def _find_loss_augmented_facility_idx(pairwise_distances, labels, chosen_ids,
array_ops.reshape(pairwise_distances_candidate, [1, -1])
], 0),
axis=0,
keep_dims=True), [num_candidates, -1]),
keepdims=True), [num_candidates, -1]),
axis=1)
nmi_scores = array_ops.zeros([num_candidates])

View File

@ -130,6 +130,105 @@ adb shell '/data/local/tmp/benchmark \
For more details, see the [benchmark documentation](../../tools/benchmark).
## CUDA support for Tegra devices running Android (Nvidia Shield TV, etc)
With the release of TF 1.6 and JetPack for Android 3.2 (currently pending), you can build a version of TensorFlow for compatible devices that receives the full benefits of GPU acceleration by following the instructions below.
#### Environment setup:
First, download and install JetPack for Android version 3.2 or greater from [Nvidia](https://developers.nvidia.com). Note that as of the TF 1.6 release the JetPack for Android 3.2 release is still pending, and regular JetPack for L4T will not work.
```bash
git clone https://github.com/tensorflow/tensorflow.git
cd tensorflow
JETPACK=$HOME/JetPack_Android_3.2
TEGRA_LIBS="$JETPACK/cuDNN/aarch64/cuda/lib64/libcudnn.so $JETPACK/cuda-9.0/extras/CUPTI/lib64/libcupti.so $JETPACK/cuda/targets/aarch64-linux-androideabi/lib64/libcufft.so"
```
#### Building all CUDA-enabled native binaries:
This will build CUDA-enabled versions of libtensorflow_inference.so and the benchmark binary. (libtensorflow_demo.so will also be built incidentally, but it does not support CUDA)
```bash
NDK_ROOT=$JETPACK/android-ndk-r13b
CC_PREFIX=ccache tensorflow/contrib/makefile/build_all_android.sh -s tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in -t "libtensorflow_inference.so libtensorflow_demo.so all" -a tegra
```
(add -T on subsequent builds to skip protobuf downloading/building)
#### Testing the CUDA-enabled benchmark via adb:
Build binaries first as above, then run:
```bash
adb shell mkdir -p /data/local/tmp/lib64
adb push $TEGRA_LIBS /data/local/tmp/lib64
adb push tensorflow/contrib/makefile/gen/bin/android_arm64-v8a/benchmark /data/local/tmp
wget https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk
unzip tensorflow_demo.apk -d /tmp/tensorflow_demo
adb push /tmp/tensorflow_demo/assets/*.pb /data/local/tmp
adb shell "LD_LIBRARY_PATH=/data/local/tmp/lib64 /data/local/tmp/benchmark --graph=/data/local/tmp/tensorflow_inception_graph.pb"
```
#### Building the CUDA-enabled TensorFlow AAR with Bazel:
Build the native binaries first as above. Then, build the aar and package the native libs by executing the following:
```bash
mkdir -p /tmp/tf/jni/arm64-v8a
cp tensorflow/contrib/makefile/gen/lib/android_tegra/libtensorflow_*.so /tmp/tf/jni/arm64-v8a/
cp $TEGRA_LIBS /tmp/tf/jni/arm64-v8a
bazel build //tensorflow/contrib/android:android_tensorflow_inference_java.aar
cp bazel-bin/tensorflow/contrib/android/android_tensorflow_inference_java.aar /tmp/tf/tensorflow.aar
cd /tmp/tf
chmod +w tensorflow.aar
zip -ur tensorflow.aar $(find jni -name *.so)
```
#### Building the CUDA-enabled TensorFlow Android demo with Bazel:
Build binaries first as above, then edit tensorflow/examples/android/BUILD and replace:
```
srcs = [
":libtensorflow_demo.so",
"//tensorflow/contrib/android:libtensorflow_inference.so",
],
```
with:
```
srcs = glob(["libs/arm64-v8a/*.so"]),
```
Then run:
```bash
# Create dir for native libs
mkdir -p tensorflow/examples/android/libs/arm64-v8a
# Copy JetPack libs
cp $TEGRA_LIBS tensorflow/examples/android/libs/arm64-v8a
# Copy native TensorFlow libraries
cp tensorflow/contrib/makefile/gen/lib/android_arm64-v8a/libtensorflow_*.so tensorflow/examples/android/libs/arm64-v8a/
# Build APK
bazel build -c opt --fat_apk_cpu=arm64-v8a tensorflow/examples/android:tensorflow_demo
# Install
adb install -r -f bazel-bin/tensorflow/examples/android/tensorflow_demo.apk
```
#### Building the CUDA-enabled Android demo with gradle/Android Studio:
Add tensorflow/examples/android as an Android project in Android Studio as normal.
Edit build.gradle and:
* set nativeBuildSystem = 'makefile'
* set cpuType = 'arm64-v8a'
* in "buildNativeMake", replace cpuType with 'tegra' (optional speedups like -T and ccache also work)
* set the environment "NDK_ROOT" var to $JETPACK/android-ndk-r13b
Click "build apk" to build.
Install:
```bash
adb install -r -f tensorflow/examples/android/gradleBuild/outputs/apk/debug/android-debug.apk
```
## iOS
_Note: To use this library in an iOS application, see related instructions in

View File

@ -36,7 +36,7 @@ while getopts "bc:Eps" opt_name; do
b) BUILD_ONLY="true";;
c) TEST_COUNT="${OPTARG}";;
E) ENABLE_EXPERIMENTAL_HEXNN_OPS="true";;
p) USE_PREBUILT_HEXAOGON_BINARIES="true";;
p) USE_PREBUILT_HEXAGON_BINARIES="true";;
s) SKIP_DOWNLOAD_IF_EXIST="true";;
*) usage;;
esac
@ -49,7 +49,7 @@ if [[ -z "${NDK_ROOT}" ]]; then
exit 1
fi
if [[ "${USE_PREBUILT_HEXAOGON_BINARIES}" != "true" &&
if [[ "${USE_PREBUILT_HEXAGON_BINARIES}" != "true" &&
-z "${QUALCOMM_SDK}" ]]; then
echo "QUALCOMM_SDK is empty" 1>&2
usage
@ -84,7 +84,7 @@ rm -rf "${GEN_DIR}"
mkdir -p "${GEN_LIBS_DIR}"
mkdir -p "${GEN_DOWNLOAD_DIR}"
if [[ "${USE_PREBUILT_HEXAOGON_BINARIES}" == "true" ]]; then
if [[ "${USE_PREBUILT_HEXAGON_BINARIES}" == "true" ]]; then
echo "Download prebuilt hexagon binaries"
if [[ "${BUILD_ONLY}" != "true" ]]; then
CONTROLLER_PUSH_DEST="/data/local/tmp"

View File

@ -572,9 +572,8 @@ class LSTMBlockWrapper(base_layer.Layer):
def _gather_states(self, data, indices, batch_size):
"""Produce `out`, s.t. out(i, j) = data(indices(i), i, j)."""
mod_indices = indices * batch_size + math_ops.range(batch_size)
return array_ops.gather(
array_ops.reshape(data, [-1, self.num_units]), mod_indices)
return array_ops.gather_nd(
data, array_ops.stack([indices, math_ops.range(batch_size)], axis=1))
class LSTMBlockFusedCell(LSTMBlockWrapper):
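The change above swaps a reshape-plus-linear-offset `gather` for a direct `gather_nd`. A hedged sketch of the indexing it performs, assuming only stock `tf.gather_nd`: stacking `[indices, range(batch)]` selects `out[i, :] = data[indices[i], i, :]`, matching the docstring.
```python
# Hedged illustration of the change above: gather_nd over stacked
# [indices, range(batch)] indices selects out[i, :] = data[indices[i], i, :]
# directly, instead of flattening `data` and computing linear offsets.
import tensorflow as tf

data = tf.reshape(tf.range(2 * 3 * 4, dtype=tf.float32), [2, 3, 4])  # [time, batch, units]
indices = tf.constant([1, 0, 1])  # per-example time step to gather
batch_size = 3

gathered = tf.gather_nd(
    data, tf.stack([indices, tf.range(batch_size)], axis=1))  # [batch, units]

with tf.Session() as sess:
    print(sess.run(gathered))  # rows: data[1, 0, :], data[0, 1, :], data[1, 2, :]
```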

View File

@ -424,8 +424,9 @@ class TimeFreqLSTMCell(rnn_cell_impl.RNNCell):
"W_O_diag", shape=[self._num_units], dtype=dtype)
# initialize the first freq state to be zero
m_prev_freq = array_ops.zeros([int(inputs.get_shape()[0]), self._num_units],
dtype)
m_prev_freq = array_ops.zeros(
[inputs.shape[0].value or inputs.get_shape()[0], self._num_units],
dtype)
for fq in range(len(freq_inputs)):
c_prev = array_ops.slice(state, [0, 2 * fq * self._num_units],
[-1, self._num_units])

View File

@ -80,12 +80,12 @@ class GatherTreeOp : public OpKernel {
max_sequence_lengths.shape().DebugString()));
Tensor* beams;
OP_REQUIRES_OK(ctx, ctx->allocate_output(0, step_ids_shape, &beams));
typename TTypes<T, 3>::ConstTensor step_ids_t = step_ids.tensor<T, 3>();
typename TTypes<T, 3>::ConstTensor parent_ids_t = parent_ids.tensor<T, 3>();
typename TTypes<T, 3>::ConstTensor step_ids_t(step_ids.tensor<T, 3>());
typename TTypes<T, 3>::ConstTensor parent_ids_t(parent_ids.tensor<T, 3>());
typename TTypes<int32>::ConstVec max_seq_lens_t =
max_sequence_lengths.vec<int32>();
typename TTypes<T>::ConstScalar end_token_t = end_token.scalar<T>();
typename TTypes<T, 3>::Tensor beams_t = beams->tensor<T, 3>();
typename TTypes<T>::ConstScalar end_token_t(end_token.scalar<T>());
typename TTypes<T, 3>::Tensor beams_t(beams->tensor<T, 3>());
const T end_token_value = end_token_t();
functor::GatherTree<Device, T>()(ctx, device, step_ids_t, parent_ids_t,
max_seq_lens_t, end_token_value, beams_t);

View File

@ -144,7 +144,7 @@ def inverse_stft_window_fn(frame_step,
overlaps = -(-frame_length // frame_step) # Ceiling division.
denom = array_ops.pad(denom, [(0, overlaps * frame_step - frame_length)])
denom = array_ops.reshape(denom, [overlaps, frame_step])
denom = math_ops.reduce_sum(denom, 0, keep_dims=True)
denom = math_ops.reduce_sum(denom, 0, keepdims=True)
denom = array_ops.tile(denom, [overlaps, 1])
denom = array_ops.reshape(denom, [overlaps * frame_step])

View File

@ -29,6 +29,7 @@ from tensorflow.contrib.framework.python.ops import variables as variables_lib
from tensorflow.contrib.metrics.python.ops import metric_ops
from tensorflow.contrib.slim.python.slim import evaluation
from tensorflow.contrib.training.python.training import evaluation as evaluation_lib
from tensorflow.core.protobuf import saver_pb2
from tensorflow.python.debug.lib import debug_data
from tensorflow.python.debug.wrappers import hooks
from tensorflow.python.framework import constant_op
@ -235,7 +236,7 @@ class SingleEvaluationTest(test.TestCase):
def _prepareCheckpoint(self, checkpoint_path):
init_op = control_flow_ops.group(variables.global_variables_initializer(),
variables.local_variables_initializer())
saver = saver_lib.Saver()
saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V1)
with self.test_session() as sess:
sess.run(init_op)
saver.save(sess, checkpoint_path)

View File

@ -497,6 +497,7 @@ py_library(
":tensor_forest_v4_ops_py",
"//tensorflow/contrib/decision_trees/proto:generic_tree_model_py",
"//tensorflow/contrib/framework:framework_py",
"//tensorflow/contrib/tensor_forest/proto:fertile_stats_proto_py",
"//tensorflow/contrib/tensor_forest/proto:tensor_forest_params_proto_py",
"//tensorflow/python:array_ops",
"//tensorflow/python:control_flow_ops",

View File

@ -1,5 +1,6 @@
# Description:
# Wrap NVIDIA TensorRT (http://developer.nvidia.com/tensorrt) with tensorflow.
# Wrap NVIDIA TensorRT (http://developer.nvidia.com/tensorrt) with tensorflow
# and provide TensorRT operators and converter package.
# APIs are meant to change over time.
package(default_visibility = ["//tensorflow:__subpackages__"])
@ -8,7 +9,19 @@ licenses(["notice"]) # Apache 2.0
exports_files(["LICENSE"])
load(
"//tensorflow:tensorflow.bzl",
"tf_cc_test",
"tf_copts",
"tf_cuda_library",
"tf_custom_op_library",
"tf_custom_op_library_additional_deps",
"tf_gen_op_libs",
"tf_gen_op_wrapper_py",
)
load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
load(
"@local_config_tensorrt//:build_defs.bzl",
"if_tensorrt",
@ -32,6 +45,195 @@ tf_cuda_cc_test(
]),
)
tf_custom_op_library(
name = "python/ops/_trt_engine_op.so",
srcs = ["ops/trt_engine_op.cc"],
deps = [
":trt_engine_op_kernel",
":trt_shape_function",
"//tensorflow/core:lib_proto_parsing",
] + if_tensorrt([
"@local_config_tensorrt//:nv_infer",
]),
)
tf_cuda_library(
name = "trt_shape_function",
srcs = ["shape_fn/trt_shfn.cc"],
hdrs = ["shape_fn/trt_shfn.h"],
visibility = ["//visibility:public"],
deps = [
":trt_logging",
] + if_tensorrt([
"@local_config_tensorrt//:nv_infer",
]) + tf_custom_op_library_additional_deps(),
)
cc_library(
name = "trt_engine_op_kernel",
srcs = ["kernels/trt_engine_op.cc"],
hdrs = ["kernels/trt_engine_op.h"],
copts = tf_copts(),
deps = [
":trt_logging",
"//tensorflow/core:gpu_headers_lib",
"//tensorflow/core:lib_proto_parsing",
"//tensorflow/core:stream_executor_headers_lib",
] + if_tensorrt([
"@local_config_tensorrt//:nv_infer",
]) + tf_custom_op_library_additional_deps(),
# TODO(laigd)
alwayslink = 1, # buildozer: disable=alwayslink-with-hdrs
)
tf_gen_op_libs(
op_lib_names = ["trt_engine_op"],
deps = if_tensorrt([
"@local_config_tensorrt//:nv_infer",
]),
)
tf_cuda_library(
name = "trt_logging",
srcs = ["log/trt_logger.cc"],
hdrs = ["log/trt_logger.h"],
visibility = ["//visibility:public"],
deps = [
"//tensorflow/core:lib_proto_parsing",
] + if_tensorrt([
"@local_config_tensorrt//:nv_infer",
]),
)
tf_gen_op_wrapper_py(
name = "trt_engine_op",
deps = [
":trt_engine_op_op_lib",
":trt_logging",
":trt_shape_function",
],
)
tf_custom_op_py_library(
name = "trt_engine_op_loader",
srcs = ["python/ops/trt_engine_op.py"],
dso = [
":python/ops/_trt_engine_op.so",
] + if_tensorrt([
"@local_config_tensorrt//:nv_infer",
]),
srcs_version = "PY2AND3",
deps = [
"//tensorflow/python:framework_for_generated_wrappers",
"//tensorflow/python:resources",
],
)
py_library(
name = "init_py",
srcs = [
"__init__.py",
"python/__init__.py",
],
srcs_version = "PY2AND3",
deps = [
":trt_convert_py",
":trt_ops_py",
],
)
py_library(
name = "trt_ops_py",
srcs_version = "PY2AND3",
deps = [
":trt_engine_op",
":trt_engine_op_loader",
],
)
py_library(
name = "trt_convert_py",
srcs = ["python/trt_convert.py"],
srcs_version = "PY2AND3",
deps = [
":wrap_conversion",
],
)
tf_py_wrap_cc(
name = "wrap_conversion",
srcs = ["trt_conversion.i"],
copts = tf_copts(),
deps = [
":trt_conversion",
"//tensorflow/core:framework_lite",
"//util/python:python_headers",
],
)
# Library for the node-level conversion portion of TensorRT operation creation
tf_cuda_library(
name = "trt_conversion",
srcs = [
"convert/convert_graph.cc",
"convert/convert_nodes.cc",
],
hdrs = [
"convert/convert_graph.h",
"convert/convert_nodes.h",
],
deps = [
":segment",
":trt_logging",
"//tensorflow/core/grappler:grappler_item",
"//tensorflow/core/grappler:utils",
"//tensorflow/core:framework",
"//tensorflow/core:framework_lite",
"//tensorflow/core:graph",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core/grappler:devices",
"//tensorflow/core/grappler/clusters:virtual_cluster",
"//tensorflow/core/grappler/costs:graph_properties",
"//tensorflow/core/grappler/optimizers:constant_folding",
"//tensorflow/core/grappler/optimizers:layout_optimizer",
] + if_tensorrt([
"@local_config_tensorrt//:nv_infer",
]) + tf_custom_op_library_additional_deps(),
)
# Library for the segmenting portion of TensorRT operation creation
cc_library(
name = "segment",
srcs = ["segment/segment.cc"],
hdrs = [
"segment/segment.h",
"segment/union_find.h",
],
linkstatic = 1,
deps = [
"//tensorflow/core:graph",
"//tensorflow/core:lib_proto_parsing",
"//tensorflow/core:protos_all_cc",
"@protobuf_archive//:protobuf_headers",
],
)
tf_cc_test(
name = "segment_test",
size = "small",
srcs = ["segment/segment_test.cc"],
deps = [
":segment",
"//tensorflow/c:c_api",
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core:test",
"//tensorflow/core:test_main",
],
)
filegroup(
name = "all_files",
srcs = glob(

View File

@ -0,0 +1,40 @@
Using TensorRT in TensorFlow
============================
This module provides the necessary bindings and introduces the TRT_engine_op
operator, which wraps a subgraph in TensorRT.
Compilation
-----------
To compile the module, you need a local TensorRT installation (libnvinfer.so
and the respective include files). During the configuration step, TensorRT
should be enabled and its installation path should be set. If TensorRT was
installed through a package manager (deb, rpm), the configure script should
find the necessary components from the system automatically. If it was
installed from a tar package, the path to the install location has to be set
during configuration.
```
bazel build --config=cuda --config=opt //tensorflow/tools/pip_package:build_pip_package
bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/
```
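The second command above writes a pip wheel into `/tmp/`. As a minimal sketch
(the exact wheel filename depends on the TensorFlow version and the Python
environment), the freshly built package can then be installed with pip:
```
pip install /tmp/tensorflow-*.whl
```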
After the TensorFlow package is installed, the TensorRT transformation
will be available. An example use is shown below.
```python
import tensorflow as tf
import tensorflow.contrib.tensorrt as trt
# ... create and train or load model
gdef = sess.graph.as_graph_def()
trt_gdef = trt.create_inference_graph(
    gdef,                      # original graph_def
    ["output"],                # name of output node(s)
    max_batch_size,            # maximum batch size to run the inference
    max_workspace_size_bytes)  # max memory for TensorRT to use
tf.reset_default_graph()
tf.import_graph_def(graph_def=trt_gdef)
# ... run inference
```
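As a minimal sketch of running inference on the converted graph (the node
names `input` and `output`, the feed shape, and the zero-filled input are
assumptions for illustration only; importing `tensorflow.contrib.tensorrt`
is what loads the kernel for `TRTEngineOp`):
```python
import numpy as np
import tensorflow as tf
import tensorflow.contrib.tensorrt as trt  # noqa: F401, registers TRTEngineOp

# Assumes trt_gdef was produced by trt.create_inference_graph() as above.
with tf.Graph().as_default() as g:
  tf.import_graph_def(graph_def=trt_gdef, name="")
  input_t = g.get_tensor_by_name("input:0")    # hypothetical input node name
  output_t = g.get_tensor_by_name("output:0")  # output node passed to the converter
  with tf.Session(graph=g) as sess:
    result = sess.run(output_t,
                      feed_dict={input_t: np.zeros((1, 224, 224, 3), np.float32)})
```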

View File

@ -0,0 +1,23 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
"""Exposes the python wrapper for TensorRT graph transforms."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# pylint: disable=unused-import,wildcard-import
from tensorflow.contrib.tensorrt.python import *
# pylint: enable=unused-import,wildcard-import

View File

@ -0,0 +1,273 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
#include <map>
#include <set>
#include <unordered_map>
#include <utility>
#include <vector>
#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
#include "tensorflow/contrib/tensorrt/segment/segment.h"
#include "tensorflow/core/graph/algorithm.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/graph/graph_constructor.h"
#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
#include "tensorflow/core/grappler/costs/graph_properties.h"
#include "tensorflow/core/grappler/devices.h"
#include "tensorflow/core/grappler/grappler_item.h"
#include "tensorflow/core/grappler/optimizers/constant_folding.h"
#include "tensorflow/core/grappler/optimizers/layout_optimizer.h"
#include "tensorflow/core/grappler/utils.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/protobuf/device_properties.pb.h" // NOLINT
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
#include "tensorrt/include/NvInfer.h"
namespace tensorflow {
namespace tensorrt {
namespace convert {
namespace {
static bool IsTensorRTCandidate(const tensorflow::NodeDef& node_def) {
// LINT.IfChange
  // TODO(jie): Segmentation shouldn't be associated with the op name.
// Split it into a registration for each kernel.
static const std::set<string> candidate_ops = {
"Identity", "Const", "Conv2D", "MaxPool", "BiasAdd", "Relu",
"Add", "Mul", "Sub", "Rsqrt", "Pad" // "Placeholder" ,"Mean"
};
// LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.h)
return candidate_ops.count(node_def.op());
}
void GetSubGraphIncomingEdges(const tensorflow::Graph& graph,
const std::set<int>& subgraph_node_ids,
tensorflow::EdgeSet* incoming_edges) {
for (int node_id : subgraph_node_ids) {
const tensorflow::Node* node = graph.FindNodeId(node_id);
for (const tensorflow::Edge* edge : node->in_edges()) {
if (!subgraph_node_ids.count(edge->src()->id()) &&
!edge->src()->IsSource()) {
incoming_edges->insert(edge);
}
}
}
}
void GetSubGraphOutgoingEdges(const tensorflow::Graph& graph,
const std::set<int>& subgraph_node_ids,
tensorflow::EdgeSet* outgoing_edges) {
for (int node_id : subgraph_node_ids) {
const tensorflow::Node* node = graph.FindNodeId(node_id);
for (const tensorflow::Edge* edge : node->out_edges()) {
if (!subgraph_node_ids.count(edge->dst()->id()) &&
!edge->dst()->IsSink()) {
outgoing_edges->insert(edge);
}
}
}
}
std::pair<string, int> ParseTensorName(string name, int default_idx = 0) {
int idx = default_idx;
size_t sep = name.find_last_of(':');
if (sep != string::npos) {
    // Parse the output index before truncating the node name.
    idx = std::stoi(name.substr(sep + 1));
    name = name.substr(0, sep);
}
return std::make_pair(name, idx);
}
std::unordered_map<string, std::vector<int>> BuildTensorNameMap(
const std::vector<string>& tensor_names) {
std::unordered_map<string, std::vector<int>> result;
for (string const& tensor_name : tensor_names) {
string node_name;
int index;
std::tie(node_name, index) = ParseTensorName(tensor_name);
result[node_name].push_back(index);
}
return result;
}
tensorflow::Status ConvertSubGraphToTensorRT(
const std::vector<string>& output_names,
const std::set<int>& subgraph_node_ids,
size_t max_batch_size, // Max batch size that engine will be created for
// Max amount of memory that engine will be allowed to consume, in bytes
size_t max_workspace_size_bytes,
const tensorflow::grappler::GraphProperties& graph_properties,
tensorflow::Graph* graph) {
tensorflow::EdgeSet subgraph_incoming_edges;
GetSubGraphIncomingEdges(*graph, subgraph_node_ids, &subgraph_incoming_edges);
std::vector<std::pair<int, int>> subgraph_inputs;
// Collect inputs by looking for incoming edges
for (const tensorflow::Edge* edge : subgraph_incoming_edges) {
subgraph_inputs.push_back({edge->src()->id(), edge->src_output()});
}
std::set<std::pair<int, int>> subgraph_outputs_set;
// Collect outputs referenced from output_names
auto output_name_to_index_map = BuildTensorNameMap(output_names);
for (int node_id : subgraph_node_ids) {
tensorflow::Node* node = graph->FindNodeId(node_id);
if (output_name_to_index_map.count(node->name())) {
for (int index : output_name_to_index_map.at(node->name())) {
subgraph_outputs_set.insert({node_id, index});
}
}
}
// Collect outputs referenced from outgoing edges
tensorflow::EdgeSet subgraph_outgoing_edges;
GetSubGraphOutgoingEdges(*graph, subgraph_node_ids, &subgraph_outgoing_edges);
for (const tensorflow::Edge* edge : subgraph_outgoing_edges) {
subgraph_outputs_set.insert({edge->src()->id(), edge->src_output()});
}
// Impose an ordering on the outputs
std::vector<std::pair<int, int>> subgraph_outputs(
subgraph_outputs_set.begin(), subgraph_outputs_set.end());
// Build TensorRT node and add it to the graph
tensorflow::NodeDef trt_node_def;
TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRTNodeDef(
*graph, subgraph_node_ids, subgraph_inputs, subgraph_outputs,
max_batch_size, max_workspace_size_bytes, graph_properties,
&trt_node_def));
tensorflow::Status status;
tensorflow::Node* trt_node = graph->AddNode(trt_node_def, &status);
TF_RETURN_IF_ERROR(status);
// Re-map outgoing edges to use the new TRT node instead of the orig subgraph
std::map<std::pair<int, int>, int> subgraph_edge_to_output_map;
for (size_t i = 0; i < subgraph_outputs.size(); ++i) {
subgraph_edge_to_output_map.insert({subgraph_outputs.at(i), i});
}
TF_RETURN_IF_ERROR(status);
for (const tensorflow::Edge* edge : subgraph_outgoing_edges) {
std::pair<int, int> old_src = {edge->src()->id(), edge->src_output()};
int new_src_output = subgraph_edge_to_output_map.at(old_src);
TF_RETURN_IF_ERROR(graph->UpdateEdge(trt_node, new_src_output, edge->dst(),
edge->dst_input()));
}
// Remove the original subgraph
for (int node_id : subgraph_node_ids) {
tensorflow::Node* node = graph->FindNodeId(node_id);
// Don't remove the input placeholders
if (node->type_string() == "Placeholder") {
continue;
}
graph->RemoveNode(node);
}
return tensorflow::Status::OK();
}
tensorflow::Status BuildNodeMap(
const tensorflow::Graph& graph,
std::unordered_map<string, tensorflow::Node*>* node_map) {
for (auto* node : graph.op_nodes()) {
if (!node_map->insert({node->name(), node}).second) {
return tensorflow::errors::AlreadyExists(
"Node name is not unique in graph: " + node->name());
}
}
return tensorflow::Status::OK();
}
} // namespace
tensorflow::Status ConvertGraphDefToTensorRT(
const tensorflow::GraphDef& graph_def,
const std::vector<string>& output_names, size_t max_batch_size,
size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def) {
// Optimization pass
tensorflow::grappler::GrapplerItem item;
item.fetch = output_names;
tensorflow::GraphDef gdef;
// Layout optimization
item.graph = graph_def;
tensorflow::grappler::LayoutOptimizer optimizer;
tensorflow::grappler::Cluster* cluster;
// Virtual cluster
tensorflow::DeviceProperties device_properties;
device_properties.set_type("GPU");
device_properties.mutable_environment()->insert({"architecture", "6"});
cluster =
new tensorflow::grappler::VirtualCluster({{"/GPU:0", device_properties}});
TF_RETURN_IF_ERROR(optimizer.Optimize(cluster, item, &gdef));
// Constant folding
item.graph = gdef;
tensorflow::grappler::ConstantFolding fold(nullptr);
TF_RETURN_IF_ERROR(fold.Optimize(nullptr, item, &gdef));
  // Static shape inference through grappler GraphProperties.
tensorflow::grappler::GraphProperties static_graph_properties(item);
TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(false));
// Build full graph
tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(),
gdef.library());
tensorflow::Graph graph(flib);
TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph(
tensorflow::GraphConstructorOptions(), gdef, &graph));
// Segment the graph into subgraphs that can be converted to TensorRT
tensorflow::tensorrt::segment::SegmentOptions segment_options;
// TODO(ben,jie,sami): exclude output nodes (DISCUSS IT)
for (auto node : output_names) {
segment_options.exclude_node_list.insert(node);
}
// TODO(sami): this should be passed as a knob!!!!
segment_options.minimum_segment_size = 2;
tensorflow::tensorrt::segment::SegmentNodesVector segments;
TF_RETURN_IF_ERROR(tensorrt::segment::SegmentGraph(
gdef, IsTensorRTCandidate, segment_options, &segments));
if (segments.size() > 1) {
VLOG(0) << "MULTIPLE tensorrt candidate conversion: " << segments.size();
}
std::unordered_map<string, tensorflow::Node*> node_map;
TF_RETURN_IF_ERROR(BuildNodeMap(graph, &node_map));
for (const std::set<string>& subgraph_node_names : segments) {
std::set<int> subgraph_node_ids;
for (const string& node_name : subgraph_node_names) {
subgraph_node_ids.insert(node_map.at(node_name)->id());
}
TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRT(
output_names, subgraph_node_ids, max_batch_size,
max_workspace_size_bytes, static_graph_properties, &graph));
}
graph.ToGraphDef(new_graph_def);
return tensorflow::Status::OK();
}
} // namespace convert
} // namespace tensorrt
} // namespace tensorflow
#endif // GOOGLE_TENSORRT
#endif // GOOGLE_CUDA

View File

@ -0,0 +1,47 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_GRAPH_H_
#define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_GRAPH_H_
#include <vector>
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/types.h"
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
namespace tensorflow {
namespace tensorrt {
namespace convert {
// max_batch_size: the maximum batch size that can be used for inference; the
//   optimization targets an inference run with this batch size.
// max_workspace_size_bytes: the upper bound of memory allowance for engine
//   building.
tensorflow::Status ConvertGraphDefToTensorRT(
const tensorflow::GraphDef& graph_def,
const std::vector<string>& output_names, size_t max_batch_size,
size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def);
} // namespace convert
} // namespace tensorrt
} // namespace tensorflow
#endif // GOOGLE_TENSORRT
#endif // GOOGLE_CUDA
#endif // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_GRAPH_H_

File diff suppressed because it is too large

View File

@ -0,0 +1,52 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_NODES_H_
#define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_NODES_H_
#include <set>
#include <utility>
#include <vector>
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/grappler/costs/graph_properties.h"
#include "tensorflow/core/lib/core/status.h"
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
namespace tensorflow {
namespace tensorrt {
namespace convert {
tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
const tensorflow::Graph& graph, const std::set<int>& subgraph_node_ids,
const std::vector<std::pair<int, int>>&
input_inds, // {node_id, output_idx}
const std::vector<std::pair<int, int>>&
output_inds, // {node_id, output_idx}
size_t max_batch_size, size_t max_workspace_size_bytes,
const tensorflow::grappler::GraphProperties& graph_prop,
tensorflow::NodeDef* trt_node);
} // namespace convert
} // namespace tensorrt
} // namespace tensorflow
#endif // GOOGLE_TENSORRT
#endif // GOOGLE_CUDA
#endif // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_NODES_H_

View File

@ -0,0 +1,140 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/contrib/tensorrt/kernels/trt_engine_op.h"
#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/types.h"
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
#include "cuda/include/cuda_runtime_api.h"
namespace tensorflow {
namespace tensorrt {
static ::tensorflow::tensorrt::Logger logger;
TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : OpKernel(context) {
// read serialized_engine
string serialized_engine;
OP_REQUIRES_OK(context,
context->GetAttr("serialized_engine", &serialized_engine));
  // Register the input and output node names of the TRT subgraph.
OP_REQUIRES_OK(context, context->GetAttr("input_nodes", &input_nodes_));
OP_REQUIRES_OK(context, context->GetAttr("output_nodes", &output_nodes_));
  // TODO(samikama): The runtime should also be obtained from a resource
  // manager; only the engine should live in the op, while the context and
  // runtime should come from the resource manager.
nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(logger);
trt_engine_ptr_.reset(infer->deserializeCudaEngine(
serialized_engine.c_str(), serialized_engine.size(), nullptr));
trt_execution_context_ptr_.reset(trt_engine_ptr_->createExecutionContext());
// Runtime is safe to delete after engine creation
infer->destroy();
}
void TRTEngineOp::Compute(OpKernelContext* context) {
int num_binding = context->num_inputs() + context->num_outputs();
std::vector<void*> buffers(num_binding);
  int binding_index;  // getBindingIndex() returns -1 if the name is not found.
int num_batch = 0;
bool valid = true;
for (int i = 0; i < context->num_inputs(); i++) {
// Grab the input tensor
binding_index = trt_engine_ptr_->getBindingIndex(input_nodes_[i].c_str());
const Tensor& input_tensor = context->input(i);
const TensorShape& input_shape = input_tensor.shape();
if (i == 0) {
num_batch = input_shape.dim_size(0);
} else if (num_batch != input_shape.dim_size(0)) {
valid = false;
break;
}
switch (trt_engine_ptr_->getBindingDataType(binding_index)) {
case nvinfer1::DataType::kFLOAT:
buffers[binding_index] = (void*)(input_tensor.flat<float>().data());
break;
case nvinfer1::DataType::kHALF:
LOG(FATAL) << "half size is not supported yet!";
break;
case nvinfer1::DataType::kINT8:
LOG(FATAL) << "int8 is not supported yet!";
break;
}
}
// Might want a different way to inform the user of batch size inconsistency
if (!valid) LOG(WARNING) << "input data inconsistent batch size";
for (int i = 0; i < static_cast<int>(output_nodes_.size()); i++) {
    // It is unfortunate that we have to reallocate the output buffer on every run.
// Create an output tensor
binding_index = trt_engine_ptr_->getBindingIndex(output_nodes_[i].c_str());
Tensor* output_tensor = nullptr;
TensorShape output_shape;
if (binding_index != -1) {
auto dims = trt_engine_ptr_->getBindingDimensions(binding_index);
std::vector<int> trt_shape(dims.nbDims + 1);
trt_shape[0] = num_batch;
for (int j = 0; j < dims.nbDims; j++) trt_shape[j + 1] = dims.d[j];
OP_REQUIRES_OK(context,
TensorShapeUtils::MakeShape(
trt_shape.data(), trt_shape.size(), &output_shape));
} else {
LOG(FATAL) << "output node not found, at " << output_nodes_[i];
break;
}
OP_REQUIRES_OK(context,
context->allocate_output(i, output_shape, &output_tensor));
switch (trt_engine_ptr_->getBindingDataType(binding_index)) {
case nvinfer1::DataType::kFLOAT:
buffers[binding_index] =
reinterpret_cast<void*>(output_tensor->flat<float>().data());
break;
case nvinfer1::DataType::kHALF:
LOG(FATAL) << "half size is not supported yet!";
break;
case nvinfer1::DataType::kINT8:
LOG(FATAL) << "int8 is not supported yet!";
break;
}
}
// copied from cuda_kernel_helper since it seems only valid in *.cu.cc files
const cudaStream_t* stream = CHECK_NOTNULL(
reinterpret_cast<const cudaStream_t*>(context->op_device_context()
->stream()
->implementation()
->CudaStreamMemberHack()));
// execution handled by TF since we are getting stream from TF.
// it is safe for CPU pointer array (buffers) to go out of scope after enqueue
trt_execution_context_ptr_->enqueue(num_batch, &buffers[0], *stream, nullptr);
}
REGISTER_KERNEL_BUILDER(Name("TRTEngineOp").Device(DEVICE_GPU), TRTEngineOp);
} // namespace tensorrt
} // namespace tensorflow
#endif // GOOGLE_TENSORRT
#endif // GOOGLE_CUDA

View File

@ -0,0 +1,62 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_ENGINE_OP_H_
#define TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_ENGINE_OP_H_
#include <memory>
#include <string>
#include <vector>
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
#include "cuda/include/cuda_runtime_api.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorrt/include/NvInfer.h"
namespace tensorflow {
namespace tensorrt {
class Logger;
class TRTEngineOp : public OpKernel {
public:
explicit TRTEngineOp(OpKernelConstruction* context);
void Compute(OpKernelContext* context) override;
private:
template <typename T>
struct Destroyer {
void operator()(T* d) { d->destroy(); }
};
template <typename T>
using destroyed_ptr = std::unique_ptr<T, Destroyer<T>>;
destroyed_ptr<nvinfer1::ICudaEngine> trt_engine_ptr_;
// TODO(samikama): context should go to a resource manager!
destroyed_ptr<nvinfer1::IExecutionContext> trt_execution_context_ptr_;
std::vector<string> input_nodes_;
std::vector<string> output_nodes_;
};
} // namespace tensorrt
} // namespace tensorflow
#endif // GOOGLE_TENSORRT
#endif // GOOGLE_CUDA
#endif // TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_ENGINE_OP_H_

View File

@ -0,0 +1,57 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
#include "tensorflow/core/platform/logging.h"
namespace tensorflow {
namespace tensorrt {
// Use TF logging for TensorRT information.
void Logger::log(Severity severity, const char* msg) {
// Suppress info-level messages
switch (severity) {
case Severity::kINFO: { // Mark TRT info messages as debug!
VLOG(2) << msg;
break;
}
case Severity::kWARNING: {
LOG(WARNING) << msg;
break;
}
case Severity::kERROR: {
LOG(ERROR) << msg;
break;
}
case Severity::kINTERNAL_ERROR: {
LOG(FATAL) << msg;
break;
}
    // Unreachable for now, but will catch new severity levels if the enum ever
    // changes. It is always good to have a default case.
default: {
LOG(FATAL) << name_ << "Got unknown severity level from TRT " << msg;
break;
}
}
}
} // namespace tensorrt
} // namespace tensorflow
#endif // GOOGLE_CUDA
#endif // GOOGLE_TENSORRT

View File

@ -0,0 +1,42 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CONTRIB_TENSORRT_LOG_TRT_LOGGER_H_
#define TENSORFLOW_CONTRIB_TENSORRT_LOG_TRT_LOGGER_H_
#include "tensorflow/core/platform/types.h"
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
#include "tensorrt/include/NvInfer.h"
namespace tensorflow {
namespace tensorrt {
// Logger for GIE info/warning/errors
class Logger : public nvinfer1::ILogger {
private:
void log(nvinfer1::ILogger::Severity severity, const char* msg) override;
string name_;
};
} // namespace tensorrt
} // namespace tensorflow
#endif // GOOGLE_TENSORRT
#endif // GOOGLE_CUDA
#endif // TENSORFLOW_CONTRIB_TENSORRT_LOG_TRT_LOGGER_H_

View File

@ -0,0 +1,43 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/tensor_shape.h"
namespace tensorflow {
namespace shape_inference {
extern Status TRTEngineOpShapeInference(InferenceContext* c);
}
REGISTER_OP("TRTEngineOp")
.Attr("serialized_engine: string")
.Attr("input_nodes: list(string)")
.Attr("output_nodes: list(string)")
.Attr("InT: list({float32})")
.Attr("OutT: list({float32})")
.Input("in_tensor: InT")
.Output("out_tensor: OutT")
.SetShapeFn(shape_inference::TRTEngineOpShapeInference);
} // namespace tensorflow
#endif // GOOGLE_TENSORRT
#endif // GOOGLE_CUDA

View File

@ -0,0 +1,24 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
"""Exposes the python wrapper for TensorRT graph transforms."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# pylint: disable=unused-import,line-too-long
from tensorflow.contrib.tensorrt.python.ops import trt_engine_op
from tensorflow.contrib.tensorrt.python.trt_convert import create_inference_graph
# pylint: enable=unused-import,line-too-long

View File

@ -0,0 +1,34 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
"""Exposes the Python wrapper of TRTEngineOp."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import platform
if platform.system() != "Windows":
# pylint: disable=wildcard-import,unused-import,g-import-not-at-top
from tensorflow.contrib.tensorrt.ops.gen_trt_engine_op import *
from tensorflow.contrib.util import loader
from tensorflow.python.platform import resource_loader
# pylint: enable=wildcard-import,unused-import,g-import-not-at-top
_trt_engine_op = loader.load_op_library(
resource_loader.get_path_to_datafile("_trt_engine_op.so"))
else:
raise RuntimeError("Windows platforms are not supported")

View File

@ -0,0 +1,103 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
"""Exposes the Python wrapper conversion to trt_graph."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# pylint: disable=unused-import,line-too-long
import six as _six
from tensorflow.contrib.tensorrt.wrap_conversion import trt_convert
from tensorflow.core.framework import graph_pb2
from tensorflow.python.framework import errors
from tensorflow.python.framework import errors_impl as _impl
from tensorflow.python.framework import ops
# TODO(skama): get outputs from session when implemented as c++
# optimization pass
def create_inference_graph(input_graph_def,
outputs,
max_batch_size=1,
max_workspace_size_bytes=2 << 20):
"""Python wrapper for the TRT transormation.
Args:
input_graph_def: GraphDef object containing a model to be transformed.
outputs: List of tensors or node names for the model outputs.
max_batch_size: max size for the input batch
max_workspace_size_bytes: parameter to control memory allocation (in Bytes)
Returns:
New GraphDef with TRTEngineOps placed in graph replacing subgraphs.
Raises:
RuntimeError: if the returned status message is malformed.
"""
def py2bytes(inp):
return inp
def py3bytes(inp):
return inp.encode("utf-8", errors="surrogateescape")
def py2string(inp):
return inp
def py3string(inp):
return inp.decode("utf-8")
if _six.PY2:
to_bytes = py2bytes
to_string = py2string
else:
to_bytes = py3bytes
to_string = py3string
out_names = []
for i in outputs:
if isinstance(i, ops.Tensor):
out_names.append(to_bytes(i.name))
else:
out_names.append(to_bytes(i))
input_graph_def_str = input_graph_def.SerializeToString()
# TODO(sami): Fix this when we can return status from C++ library
# There is a problem with the TF internal library setup that doesn't
# allow us to return a status object from C++. Thus we return a
# pair or strings where first one is encoded status and the second
# one is the transformed graphs protobuf string.
out = trt_convert(input_graph_def_str, out_names, max_batch_size,
max_workspace_size_bytes)
status = to_string(out[0])
output_graph_def_string = out[1]
del input_graph_def_str # Save some memory
if len(status) < 2:
raise _impl.UnknownError(None, None, status)
if status[:2] != "OK":
msg = status.split(";")
if len(msg) == 1:
raise RuntimeError("Status message is malformed {}".format(status))
# pylint: disable=protected-access
raise _impl._make_specific_exception(None, None, ";".join(msg[1:]),
int(msg[0]))
# pylint: enable=protected-access
output_graph_def = graph_pb2.GraphDef()
output_graph_def.ParseFromString(output_graph_def_string)
del output_graph_def_string # Save some memory
return output_graph_def

View File

@ -0,0 +1,253 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/contrib/tensorrt/segment/segment.h"
#include <set>
#include <unordered_map>
#include <vector>
#include "tensorflow/contrib/tensorrt/segment/union_find.h"
#include "tensorflow/core/graph/algorithm.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/graph/graph_constructor.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
namespace tensorrt {
namespace segment {
namespace {
bool CanContractEdge(const tensorflow::Edge* edge,
const tensorflow::Graph& graph) {
const tensorflow::Node* src = edge->src();
const tensorflow::Node* dst = edge->dst();
// Can't contract edge if doing so would cause a cycle in the
// graph. So, if there is a directed path from 'src' to 'dst', other
// than 'edge' (or any other direct edge from 'src' to 'dst'), then
// combining 'src' and 'dst' will cause a cycle along that path.
//
// In practice, to avoid modifying the graph and to take advantage
// of existing graph functions, we perform an equivalent.
// 1. Get all nodes incoming to 'dst', excluding 'src'
// 2. Reverse DFS from those nodes
// 3. If reverse DFS reaches 'src' then we have a cycle
std::vector<tensorflow::Node*> dfs_start_nodes;
for (tensorflow::Node* node : dst->in_nodes()) {
if (node != src) {
dfs_start_nodes.push_back(node);
}
}
bool is_cycle = false;
if (!dfs_start_nodes.empty()) {
tensorflow::ReverseDFSFrom(graph, dfs_start_nodes, {},
[&is_cycle, src](tensorflow::Node* node) {
if (node == src) {
is_cycle = true;
}
});
}
return !is_cycle;
}
void ContractEdge(tensorflow::Edge* edge, tensorflow::Graph* graph,
std::vector<const tensorflow::Edge*>* remove_edges) {
// Transfer all inputs and outputs of 'dst' to 'src' except edges
// connecting the two.
tensorflow::Node* src = edge->src();
tensorflow::Node* dst = edge->dst();
// We can use '0' for input/output index because we don't need them
// to be accurate for the way we are using the graph.
std::vector<const tensorflow::Edge*> in_edges(dst->in_edges().begin(),
dst->in_edges().end());
for (const tensorflow::Edge* in_edge : in_edges) {
if (in_edge->src() != src) {
tensorflow::Edge* e = const_cast<tensorflow::Edge*>(in_edge);
if (e->src() == graph->source_node()) {
graph->AddEdge(e->src(), e->src_output(), src,
tensorflow::Graph::kControlSlot);
} else {
graph->AddEdge(e->src(), e->src_output(), src, 0 /* input index */);
}
}
}
std::vector<const tensorflow::Edge*> out_edges(dst->out_edges().begin(),
dst->out_edges().end());
for (const tensorflow::Edge* out_edge : out_edges) {
tensorflow::Edge* e = const_cast<tensorflow::Edge*>(out_edge);
if (e->dst() == graph->sink_node()) {
graph->AddEdge(src, tensorflow::Graph::kControlSlot, e->dst(),
e->dst_input());
} else {
graph->AddEdge(src, 0 /* output index */, e->dst(), e->dst_input());
}
}
// Return the edges that must be removed to disconnect 'dst' from
// the graph. We don't actually remove 'dst' since the caller holds
// references to all the nodes.
for (const auto& in_edge : dst->in_edges()) {
remove_edges->push_back(in_edge);
}
for (const auto& out_edge : dst->out_edges()) {
remove_edges->push_back(out_edge);
}
}
} // namespace
tensorflow::Status SegmentGraph(
const tensorflow::GraphDef& gdef,
const std::function<bool(const tensorflow::NodeDef&)>& candidate_fn,
const SegmentOptions& options, SegmentNodesVector* segments) {
// Create a Graph representation of the GraphDef.
tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(),
gdef.library());
tensorflow::Graph graph(flib);
TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph(
tensorflow::GraphConstructorOptions(), gdef, &graph));
// tensorflow::DumpGraph("Pre-Segment", &graph);
// Use a union-find to collect the nodes that belong to the same
// segment. A node value of nullptr indicates that the node is not a
// candidate for TRT.
std::vector<UnionFind<tensorflow::Node*>> node_segments;
for (int i = 0; i < graph.num_node_ids(); ++i) {
tensorflow::Node* node = graph.FindNodeId(i);
if (options.exclude_node_list.count(node->name()) != 0 ||
!candidate_fn(node->def())) {
node = nullptr;
}
node_segments.emplace_back(node);
}
// The segmentation algorithm below visits nodes in reverse
// topological order and attempts to merge nodes along output
// edges. That means that subgraphs grow from the output-side of the
// network towards the inputs. In general this is not guaranteed to
// produce a globally optimal segmentation. In the future if we have
// a measure of how beneficial it is to include a given node in a
// TRT subgraph then we can revisit this algorithm to take advantage
// of that information.
std::vector<tensorflow::Node*> order;
tensorflow::GetPostOrder(graph, &order);
for (const tensorflow::Node* node : order) {
// All output nodes of 'node' have been visited...
VLOG(2) << "Trying node " << node->name();
// 'node' must be a TRT candidate...
if (node_segments[node->id()].Value() == nullptr) {
VLOG(2) << "... not a TRT candidate";
continue;
}
// Contract output edges to combine 'node' with output
// nodes. Iterate since combining two nodes may unblock other
// combining.
while (true) {
std::set<const tensorflow::Edge*> contract_edges;
for (const tensorflow::Edge* out_edge : node->out_edges()) {
VLOG(2) << "... out node " << out_edge->dst()->name();
// Out node must be TRT candidate...
if (node_segments[out_edge->dst()->id()].Value() == nullptr) {
VLOG(2) << "... ... not a TRT candidate";
continue;
}
if (CanContractEdge(out_edge, graph)) {
VLOG(2) << "... ... can contract";
contract_edges.insert(out_edge);
} else {
VLOG(2) << "... ... cannot contract, would form cycle";
}
}
if (contract_edges.empty()) {
break;
}
// Contract edges and collect the adjacent nodes into the same
// segment/subgraph.
while (!contract_edges.empty()) {
const tensorflow::Edge* contract_edge = *contract_edges.begin();
const tensorflow::Node* src = contract_edge->src();
const tensorflow::Node* dst = contract_edge->dst();
VLOG(2) << "Merge " << src->name() << " <- " << dst->name();
node_segments[src->id()].Merge(&node_segments[dst->id()]);
// Contracting the edge leaves disconnected graph edges.
// Remove these from the graph and from 'contract_edges' so we
// don't visit them again.
tensorflow::Edge* e = const_cast<tensorflow::Edge*>(contract_edge);
std::vector<const tensorflow::Edge*> remove_edges;
ContractEdge(e, &graph, &remove_edges);
for (const tensorflow::Edge* r : remove_edges) {
contract_edges.erase(r);
graph.RemoveEdge(r);
}
}
}
}
// Collect the segments/subgraphs. Each subgraph is represented by a
// set of the names of the nodes in that subgraph.
std::unordered_map<string, std::set<string>> sg_map;
for (auto& u : node_segments) {
if ((u.Value() != nullptr) && (u.ParentValue() != nullptr)) {
sg_map[u.ParentValue()->name()].insert(u.Value()->name());
}
}
// Convert the segments into the expected return format
for (const auto& itr : sg_map) {
const auto& segment_node_names = itr.second;
if (VLOG_IS_ON(1)) {
string s;
for (const auto& name : segment_node_names) {
s += " " + name;
}
VLOG(1) << "Segment " << segments->size() << ":" << s;
}
// Don't use small segments.
if (static_cast<int>(segment_node_names.size()) <
options.minimum_segment_size) {
VLOG(1) << "Segment " << segments->size() << " has only "
<< segment_node_names.size() << " nodes, dropping";
continue;
}
segments->emplace_back(segment_node_names);
}
return tensorflow::Status::OK();
}
} // namespace segment
} // namespace tensorrt
} // namespace tensorflow

View File

@ -0,0 +1,56 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_SEGMENT_H_
#define TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_SEGMENT_H_
#include <set>
#include <vector>
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
namespace tensorrt {
namespace segment {
using SegmentNodesVector = std::vector<std::set<string>>;
struct SegmentOptions {
// Segment must contain at least this many nodes.
int minimum_segment_size = 2;
std::set<string> exclude_node_list;
};
// Get the subgraphs of a graph that can be handled by TensorRT.
//
// @param gdef The GraphDef describing the network
// @param candidate_fn A function that returns true for a NodeDef if
// that node can be handled by TensorRT.
// @param segments Returns the TensorRT segments/subgraphs. Each entry
// in the vector describes a subgraph by giving a set of the names of
// all the NodeDefs in that subgraph.
// @return the status.
tensorflow::Status SegmentGraph(
const tensorflow::GraphDef& gdef,
const std::function<bool(const tensorflow::NodeDef&)>& candidate_fn,
const SegmentOptions& options, SegmentNodesVector* segments);
} // namespace segment
} // namespace tensorrt
} // namespace tensorflow
#endif // TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_SEGMENT_H_

View File

@ -0,0 +1,367 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/contrib/tensorrt/segment/segment.h"
#include "tensorflow/c/c_api.h"
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
namespace tensorrt {
namespace segment {
namespace test {
class SegmentTest : public ::testing::Test {
public:
bool GetGraphDef(TF_Graph* graph, tensorflow::GraphDef* graph_def);
TF_Operation* Placeholder(TF_Graph* graph, TF_Status* s, const char* name);
TF_Operation* Add(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
TF_Status* s, const char* name);
std::function<bool(const NodeDef&)> MakeCandidateFn(
const std::set<string>& node_names);
protected:
void PlaceholderHelper(TF_Graph* graph, TF_Status* s, const char* name,
TF_Operation** op);
void AddHelper(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
TF_Status* s, const char* name, TF_Operation** op, bool check);
SegmentOptions default_options_;
};
bool SegmentTest::GetGraphDef(TF_Graph* graph,
tensorflow::GraphDef* graph_def) {
TF_Status* s = TF_NewStatus();
TF_Buffer* buffer = TF_NewBuffer();
TF_GraphToGraphDef(graph, buffer, s);
bool ret = TF_GetCode(s) == TF_OK;
EXPECT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
if (ret) ret = graph_def->ParseFromArray(buffer->data, buffer->length);
TF_DeleteBuffer(buffer);
TF_DeleteStatus(s);
return ret;
}
std::function<bool(const NodeDef&)> SegmentTest::MakeCandidateFn(
const std::set<string>& node_names) {
return [node_names](const NodeDef& node) -> bool {
return node_names.find(node.name()) != node_names.end();
};
}
void SegmentTest::PlaceholderHelper(TF_Graph* graph, TF_Status* s,
const char* name, TF_Operation** op) {
TF_OperationDescription* desc = TF_NewOperation(graph, "Placeholder", name);
TF_SetAttrType(desc, "dtype", TF_INT32);
*op = TF_FinishOperation(desc, s);
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
ASSERT_NE(*op, nullptr);
}
TF_Operation* SegmentTest::Placeholder(TF_Graph* graph, TF_Status* s,
const char* name) {
TF_Operation* op;
PlaceholderHelper(graph, s, name, &op);
return op;
}
void SegmentTest::AddHelper(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
TF_Status* s, const char* name, TF_Operation** op,
bool check) {
TF_OperationDescription* desc = TF_NewOperation(graph, "AddN", name);
TF_Output add_inputs[2] = {{l, 0}, {r, 0}};
TF_AddInputList(desc, add_inputs, 2);
*op = TF_FinishOperation(desc, s);
if (check) {
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
ASSERT_NE(*op, nullptr);
}
}
TF_Operation* SegmentTest::Add(TF_Operation* l, TF_Operation* r,
TF_Graph* graph, TF_Status* s,
const char* name) {
TF_Operation* op;
AddHelper(l, r, graph, s, name, &op, true);
return op;
}
TEST_F(SegmentTest, Empty) {
TF_Graph* graph = TF_NewGraph();
GraphDef graph_def;
ASSERT_TRUE(GetGraphDef(graph, &graph_def));
SegmentNodesVector segments;
ASSERT_EQ(
SegmentGraph(graph_def, MakeCandidateFn({}), default_options_, &segments),
tensorflow::Status::OK());
// Expect no segments/subgraphs.
EXPECT_TRUE(segments.empty());
TF_DeleteGraph(graph);
}
TEST_F(SegmentTest, Simple) {
TF_Status* s = TF_NewStatus();
TF_Graph* graph = TF_NewGraph();
// feed
// // ||
// add0 add1
// | | /
// | add2
// | / ||
// add3 add4
// | /
// <sink>
//
TF_Operation* feed = Placeholder(graph, s, "feed");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
EXPECT_EQ(string("feed"), string(TF_OperationName(feed)));
TF_Operation* add0 = Add(feed, feed, graph, s, "add0");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add1 = Add(feed, feed, graph, s, "add1");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add2 = Add(add0, add1, graph, s, "add2");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add3 = Add(add0, add2, graph, s, "add3");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
EXPECT_EQ(string("add3"), string(TF_OperationName(add3)));
TF_Operation* add4 = Add(add2, add2, graph, s, "add4");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
EXPECT_EQ(string("add4"), string(TF_OperationName(add4)));
GraphDef graph_def;
ASSERT_TRUE(GetGraphDef(graph, &graph_def));
SegmentNodesVector segments;
ASSERT_EQ(
SegmentGraph(graph_def,
MakeCandidateFn({"add0", "add1", "add2", "add3", "add4"}),
default_options_, &segments),
tensorflow::Status::OK());
// Expect all Add operations to be collapsed into a single segment
ASSERT_EQ(segments.size(), 1);
std::vector<string> expected{"add0", "add1", "add2", "add3", "add4"};
for (const auto& ex : expected) {
EXPECT_TRUE(segments[0].find(ex) != segments[0].end())
<< "Missing expected node " << ex;
}
TF_DeleteGraph(graph);
TF_DeleteStatus(s);
}
TEST_F(SegmentTest, AvoidCycle) {
TF_Status* s = TF_NewStatus();
TF_Graph* graph = TF_NewGraph();
// add2 is not a TRT candidate so add0/add3 cannot be formed as a
// subgraph
//
// feed
// // ||
// add0 add1
// | | /
// | add2
// | / ||
// add3 add4
// | /
// <sink>
//
TF_Operation* feed = Placeholder(graph, s, "feed");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
EXPECT_EQ(string("feed"), string(TF_OperationName(feed)));
TF_Operation* add0 = Add(feed, feed, graph, s, "add0");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add1 = Add(feed, feed, graph, s, "add1");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add2 = Add(add0, add1, graph, s, "add2");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add3 = Add(add0, add2, graph, s, "add3");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
EXPECT_EQ(string("add3"), string(TF_OperationName(add3)));
TF_Operation* add4 = Add(add2, add2, graph, s, "add4");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
EXPECT_EQ(string("add4"), string(TF_OperationName(add4)));
GraphDef graph_def;
ASSERT_TRUE(GetGraphDef(graph, &graph_def));
SegmentNodesVector segments;
ASSERT_EQ(
SegmentGraph(graph_def, MakeCandidateFn({"add0", "add1", "add3", "add4"}),
default_options_, &segments),
tensorflow::Status::OK());
// Expect no subgraphs
EXPECT_EQ(segments.size(), 0);
TF_DeleteGraph(graph);
TF_DeleteStatus(s);
}
TEST_F(SegmentTest, Multiple) {
TF_Status* s = TF_NewStatus();
TF_Graph* graph = TF_NewGraph();
// add5 is not a TRT candidate so two subgraphs should be formed
//
// feed
// // || ||
// add0 add1 add7
// | | / / ||
// | add2-----add5 add8
// | / | | | |
// add3 add4 add6
// | | /
// <sink>
//
TF_Operation* feed = Placeholder(graph, s, "feed");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
EXPECT_EQ(string("feed"), string(TF_OperationName(feed)));
TF_Operation* add0 = Add(feed, feed, graph, s, "add0");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add1 = Add(feed, feed, graph, s, "add1");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add7 = Add(feed, feed, graph, s, "add7");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add2 = Add(add0, add1, graph, s, "add2");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add5 = Add(add2, add7, graph, s, "add5");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add8 = Add(add7, add7, graph, s, "add8");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add3 = Add(add0, add2, graph, s, "add3");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
EXPECT_EQ(string("add3"), string(TF_OperationName(add3)));
TF_Operation* add4 = Add(add2, add5, graph, s, "add4");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
EXPECT_EQ(string("add4"), string(TF_OperationName(add4)));
TF_Operation* add6 = Add(add5, add8, graph, s, "add6");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
EXPECT_EQ(string("add6"), string(TF_OperationName(add6)));
GraphDef graph_def;
ASSERT_TRUE(GetGraphDef(graph, &graph_def));
SegmentNodesVector segments;
ASSERT_EQ(SegmentGraph(graph_def,
MakeCandidateFn({"add0", "add1", "add2", "add3",
"add4", "add6", "add7", "add8"}),
default_options_, &segments),
tensorflow::Status::OK());
// Expect two subgraphs
EXPECT_EQ(segments.size(), 2);
std::vector<string> expected0{"add0", "add1", "add2", "add3"};
for (const auto& ex : expected0) {
EXPECT_TRUE(segments[0].find(ex) != segments[0].end())
<< "Missing expected node " << ex;
}
std::vector<string> expected1{"add6", "add8"};
for (const auto& ex : expected1) {
EXPECT_TRUE(segments[1].find(ex) != segments[1].end())
<< "Missing expected node " << ex;
}
TF_DeleteGraph(graph);
TF_DeleteStatus(s);
}
TEST_F(SegmentTest, BigIfElse) {
TF_Status* s = TF_NewStatus();
TF_Graph* graph = TF_NewGraph();
// add2 is not a TRT candidate
//
// feed
// ||
// add0
// // ||
// add1 add4
// || ||
// add2 add5
// || ||
// add3 add6
// || //
// add7
// ||
// <sink>
//
TF_Operation* feed = Placeholder(graph, s, "feed");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
EXPECT_EQ(string("feed"), string(TF_OperationName(feed)));
TF_Operation* add0 = Add(feed, feed, graph, s, "add0");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add1 = Add(add0, add0, graph, s, "add1");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add2 = Add(add1, add1, graph, s, "add2");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add3 = Add(add2, add2, graph, s, "add3");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add4 = Add(add0, add0, graph, s, "add4");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add5 = Add(add4, add4, graph, s, "add5");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add6 = Add(add5, add5, graph, s, "add6");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
TF_Operation* add7 = Add(add3, add6, graph, s, "add7");
ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
EXPECT_EQ(string("add7"), string(TF_OperationName(add7)));
GraphDef graph_def;
ASSERT_TRUE(GetGraphDef(graph, &graph_def));
SegmentNodesVector segments;
ASSERT_EQ(SegmentGraph(graph_def,
MakeCandidateFn({"add0", "add1", "add3", "add4",
"add5", "add6", "add7"}),
default_options_, &segments),
tensorflow::Status::OK());
// Expect 2 subgraphs
EXPECT_EQ(segments.size(), 2);
std::vector<string> expected0{"add3", "add4", "add5", "add6", "add7"};
for (const auto& ex : expected0) {
EXPECT_TRUE(segments[0].find(ex) != segments[0].end())
<< "Missing expected node " << ex;
}
std::vector<string> expected1{"add0", "add1"};
for (const auto& ex : expected1) {
EXPECT_TRUE(segments[1].find(ex) != segments[1].end())
<< "Missing expected node " << ex;
}
TF_DeleteGraph(graph);
TF_DeleteStatus(s);
}
} // namespace test
} // namespace segment
} // namespace tensorrt
} // namespace tensorflow

View File

@ -0,0 +1,79 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_UNION_FIND_H_
#define TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_UNION_FIND_H_
namespace tensorflow {
namespace tensorrt {
namespace segment {
// Union-Find data structure.
// Each cluster has an associated value; when merging clusters we can control
// which value becomes the representative of the merged clusters. Values must be
// copyable.
template <typename T>
class UnionFind {
public:
UnionFind() : size_(1), parent_(nullptr) {}
explicit UnionFind(const T& v) : size_(1), parent_(nullptr), value_(v) {}
// Returns the number of elements in a cluster.
int Size() { return FindRoot()->size_; }
// Merges this cluster with 'other'. This cluster's value becomes
// the value of the merged cluster; the value of 'other' is ignored.
void Merge(UnionFind* other);
// Each cluster has an associated value. Retrieves the value associated
// with this cluster.
T& ParentValue() { return FindRoot()->value_; }
// Get the original value of this node.
T& Value() { return value_; }
private:
// Finds the root element of the cluster. Performs path compression.
UnionFind* FindRoot();
int size_;
UnionFind* parent_;
T value_;
};
template <typename T>
void UnionFind<T>::Merge(UnionFind* other) {
UnionFind<T>* a = FindRoot();
UnionFind<T>* b = other->FindRoot();
if (a == b) return;
b->parent_ = a;
a->size_ += b->size_;
}
template <typename T>
UnionFind<T>* UnionFind<T>::FindRoot() {
if (!parent_) return this;
// Path compression: update intermediate nodes to point to the root of the
// equivalence class.
parent_ = parent_->FindRoot();
return parent_;
}
} // namespace segment
} // namespace tensorrt
} // namespace tensorflow
#endif // TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_UNION_FIND_H_

View File

@ -0,0 +1,89 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h"
#include <string>
#include <vector>
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorrt/include/NvInfer.h"
namespace tensorflow {
namespace shape_inference {
tensorflow::Status TRTEngineOpShapeInference(InferenceContext* context) {
tensorflow::tensorrt::Logger logger;
string serialized_engine;
TF_RETURN_IF_ERROR(context->GetAttr("serialized_engine", &serialized_engine));
nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(logger);
nvinfer1::ICudaEngine* trt_engine = infer->deserializeCudaEngine(
serialized_engine.c_str(), serialized_engine.size(), nullptr);
int num_batch = -1;
std::vector<::tensorflow::DataType> input_type;
TF_RETURN_IF_ERROR(context->GetAttr("InT", &input_type));
for (size_t i = 0; i < context->num_inputs(); i++) {
// Check if input shape is legit
auto input_shape = context->input(i);
for (int j = 0; j < context->Rank(input_shape); j++) {
auto dim_handler = context->Dim(input_shape, j);
if (j == 0) {
if (i == 0) {
num_batch = context->Value(dim_handler);
} else if (num_batch != context->Value(dim_handler)) {
// TODO(jie): TensorRT engine requires a consistent batch size across input
// tensors. The segmenter should be aware of this.
LOG(FATAL) << "TensorRT engine requires consistent batch size";
}
}
}
}
// Arrange input here
std::vector<string> input_nodes;
TF_RETURN_IF_ERROR(context->GetAttr("input_nodes", &input_nodes));
// Arrange output here
std::vector<string> output_nodes;
TF_RETURN_IF_ERROR(context->GetAttr("output_nodes", &output_nodes));
for (size_t i = 0; i < output_nodes.size(); i++) {
int binding_index = trt_engine->getBindingIndex(output_nodes[i].c_str());
ShapeHandle output_shape;
std::vector<DimensionHandle> dim_vec;
dim_vec.emplace_back(context->MakeDim(num_batch));
if (binding_index != -1) {
auto dims = trt_engine->getBindingDimensions(binding_index);
for (int j = 0; j < dims.nbDims; j++) {
dim_vec.emplace_back(context->MakeDim(dims.d[j]));
}
} else {
LOG(FATAL) << "TensorRT engine cannot find binding: " << output_nodes[i];
}
output_shape = context->MakeShape(dim_vec);
context->set_output(i, output_shape);
}
return Status::OK();
}
} // namespace shape_inference
} // namespace tensorflow
#endif // GOOGLE_TENSORRT
#endif // GOOGLE_CUDA

View File

@ -0,0 +1,33 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CONTRIB_TENSORRT_SHAPE_FN_TRT_SHFN_H_
#define TENSORFLOW_CONTRIB_TENSORRT_SHAPE_FN_TRT_SHFN_H_
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/lib/core/status.h"
namespace tensorflow {
namespace shape_inference {
Status TRTEngineOpShapeInference(InferenceContext* c);
} // namespace shape_inference
} // namespace tensorflow
#endif // GOOGLE_TENSORRT
#endif // GOOGLE_CUDA
#endif // TENSORFLOW_CONTRIB_TENSORRT_SHAPE_FN_TRT_SHFN_H_

View File

@ -0,0 +1,88 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Script to test TF-TensorRT integration."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
# Normally we would do `import tensorflow as tf` and then use tf.placeholder,
# tf.constant, tf.nn.conv2d, etc., but internal builds don't like that, so we
# import every module individually instead.
from tensorflow.contrib import tensorrt as trt
from tensorflow.core.protobuf import config_pb2 as cpb2
from tensorflow.python.client import session as csess
from tensorflow.python.framework import constant_op as cop
from tensorflow.python.framework import dtypes as dtypes
from tensorflow.python.framework import importer as importer
from tensorflow.python.framework import ops as ops
from tensorflow.python.ops import array_ops as aops
from tensorflow.python.ops import nn as nn
from tensorflow.python.ops import nn_ops as nn_ops
def get_simple_graph_def():
"""Create a simple graph and return its graph_def."""
g = ops.Graph()
with g.as_default():
a = aops.placeholder(
dtype=dtypes.float32, shape=(None, 24, 24, 2), name="input")
e = cop.constant(
[[[[1., 0.5, 4., 6., 0.5, 1.], [1., 0.5, 1., 1., 0.5, 1.]]]],
name="weights",
dtype=dtypes.float32)
conv = nn.conv2d(
input=a, filter=e, strides=[1, 2, 2, 1], padding="SAME", name="conv")
b = cop.constant(
[4., 1.5, 2., 3., 5., 7.], name="bias", dtype=dtypes.float32)
t = nn.bias_add(conv, b, name="biasAdd")
relu = nn.relu(t, "relu")
idty = aops.identity(relu, "ID")
v = nn_ops.max_pool(
idty, [1, 2, 2, 1], [1, 2, 2, 1], "VALID", name="max_pool")
aops.squeeze(v, name="output")
return g.as_graph_def()
def run_graph(gdef, dumm_inp):
gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
ops.reset_default_graph()
g = ops.Graph()
with g.as_default():
inp, out = importer.import_graph_def(
graph_def=gdef, return_elements=["input", "output"])
inp = inp.outputs[0]
out = out.outputs[0]
with csess.Session(
config=cpb2.ConfigProto(gpu_options=gpu_options), graph=g) as sess:
val = sess.run(out, {inp: dumm_inp})
return val
if "__main__" in __name__:
inp_dims = (100, 24, 24, 2)
dummy_input = np.random.random_sample(inp_dims)
gdef = get_simple_graph_def()
# Get optimized graph
trt_graph = trt.create_inference_graph(gdef, ["output"], inp_dims[0])
o1 = run_graph(gdef, dummy_input)
o2 = run_graph(trt_graph, dummy_input)
o3 = run_graph(trt_graph, dummy_input)
assert np.array_equal(o1, o2)
assert np.array_equal(o3, o2) # sanity check
print("Pass")

View File

@ -0,0 +1,131 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
/* Wrap trt_conversion */
%{
#define SWIG_FILE_WITH_INIT
%}
%include "std_pair.i"
%include "tensorflow/python/platform/base.i"
%{
PyObject* pair_helper(std::pair<string, string>* in) {
PyObject *first(nullptr), *second(nullptr), *tuple(nullptr);
first = PyBytes_FromStringAndSize(in->first.data(), in->first.length());
if (!first) {
if (!PyErr_Occurred()) {
PyErr_SetString(PyExc_TypeError, "Pair conversion first argument failed");
}
return NULL;
}
second = PyBytes_FromStringAndSize(in->second.data(), in->second.length());
if (!second) {
if (!PyErr_Occurred()) {
PyErr_SetString(PyExc_TypeError,
"Pair conversion second argument failed");
}
return NULL;
}
tuple = Py_BuildValue("(OO)", first, second);
if (!tuple) {
if (!PyErr_Occurred()) {
PyErr_SetString(PyExc_TypeError,
"Tuple creation from pair<string,string> failed!");
}
return NULL;
}
return tuple;
}
%}
%typemap(out) std::pair<string, string> {
PyObject *tuple = pair_helper(&$1);
if (!tuple) SWIG_fail;
$result = tuple;
}
%{
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/util/stat_summarizer.h"
#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
%}
%ignoreall
%unignore tensorflow;
%unignore trt_convert;
%{
std::pair<string, string> trt_convert(
string graph_def_string, // The serialized GraphDef string.
std::vector<string> output_names,
size_t max_batch_size,
size_t max_workspace_size_bytes
// Unfortunately we can't use TF_Status here since it
// is in c/c_api and brings in a lot of other libraries
// which in turn declare ops. These ops are included
// statically in our library and cause an abort when the
// module is loaded due to double registration. Until
// TensorFlow properly exposes these headers, we work
// around this by returning a string and converting it
// to an exception on the Python side.
//,TF_Status* out_status) {
) {
#if GOOGLE_CUDA && GOOGLE_TENSORRT
string out_status;
tensorflow::GraphDef graph_def;
if (!graph_def.ParseFromString(graph_def_string)) {
out_status = "InvalidArgument;Couldn't interpret input as a GraphDef";
return std::pair<string, string>{out_status, ""};
}
if (!output_names.size()) {
out_status = "InvalidArgument;Size of the output_names vector is 0";
return std::pair<string, string>{out_status, ""};
// return "";
}
tensorflow::GraphDef outGraph;
tensorflow::Status conversion_status =
tensorflow::tensorrt::convert::ConvertGraphDefToTensorRT(
graph_def, output_names, max_batch_size, max_workspace_size_bytes,
&outGraph);
if (!conversion_status.ok()) {
auto retCode = (int)conversion_status.code();
char buff[2000];
snprintf(buff, 2000, "%d;%s", retCode,
conversion_status.error_message().c_str());
out_status = buff;
return std::pair<string, string>{out_status, ""};
}
string result;
if (!outGraph.SerializeToString(&result)) {
out_status = "InvalidArgument;Couldn't serialize output as a GraphDef";
return std::pair<string, string>{out_status, ""};
}
out_status = "OK;All good!";
return std::pair<string, string>{out_status, result};
#else
// Returns FAILED_PRECONDITION.
return std::pair<string, string>{"9;TensorRT is not enabled!", ""};
#endif // GOOGLE_CUDA && GOOGLE_TENSORRT
}
%}
std::pair<string, string> trt_convert(string graph_def_string,
std::vector<string> output_names,
size_t max_batch_size,
size_t max_workspace_size_bytes);
%unignoreall
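As the comment above notes, the serialized "code;message" status is meant to be converted back into an exception on the Python side. A minimal sketch of that handling, assuming the SWIG-generated module is passed in as `trt_wrapper` (the module name and helper function are assumptions; only the (status, graph) pair and the "code;message" format come from the wrapper above):
```python
# Hypothetical Python-side handling of trt_convert's (status, graph) pair.
def convert_or_raise(trt_wrapper, graph_def_bytes, output_names,
                     max_batch_size, max_workspace_size_bytes):
  status, trt_graph = trt_wrapper.trt_convert(
      graph_def_bytes, output_names, max_batch_size, max_workspace_size_bytes)
  if isinstance(status, bytes):
    status = status.decode("utf-8")
  code, _, message = status.partition(";")
  if code != "OK":
    # Re-raise the serialized status as a Python exception.
    raise RuntimeError("TRT conversion failed (%s): %s" % (code, message))
  return trt_graph
```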

View File

@ -20,7 +20,7 @@ from __future__ import print_function
from setuptools import setup
_VERSION = '1.6.0-rc0'
_VERSION = '1.6.0-rc1'
CONSOLE_SCRIPTS = [
'capture_tpu_profile=cloud_tpu_profiler.main:run_main',

View File

@ -40,7 +40,7 @@ namespace tensorflow {
// a BaseGPUDevice. Note that the configuration allows us to create multiple
// BaseGPUDevice per GPU hardware in order to use multi CUDA streams on the
// hardware, so the mapping between TF GPU id and CUDA GPU id is not a 1:1
// mappping, see the example below.
// mapping, see the example below.
//
// For example, assuming that in the machine we have GPU device with index 0, 1,
// 2 and 3 (physical GPU id). Setting "CUDA_VISIBLE_DEVICES=1,2,3" will create

View File

@ -21,7 +21,6 @@ limitations under the License.
#ifdef INTEL_MKL
#include <unistd.h>
#include <cstdlib>
#include <string>
#include "tensorflow/core/common_runtime/bfc_allocator.h"

View File

@ -191,9 +191,6 @@ class TensorShapeBase : public TensorShapeRep {
/// Appends all the dimensions from `shape`.
void AppendShape(const TensorShapeBase& shape);
// Maximum number of dimensions in a tensor.
static constexpr int MaxDimensions() { return 254; }
/// \brief Insert a dimension somewhere in the `TensorShape`.
/// REQUIRES: `0 <= d <= dims()`
/// REQUIRES: `size >= 0`

View File

@ -222,7 +222,7 @@ Status MklToTfConversionPass::InsertInputConversionNode(
BaseType(n->input_type(0)));
// Check ordering of edges
for (uint i = 0; i < 4; i++) {
for (uint32 i = 0; i < 4; i++) {
CHECK_EQ((edges[i]->dst_input() == i), true);
}

View File

@ -71,7 +71,7 @@ class RGBToHSVOp : public OpKernel {
TensorShape({input_data.dimension(0)}),
&trange));
typename TTypes<T, 1>::Tensor range = trange.tensor<T, 1>();
typename TTypes<T, 1>::Tensor range(trange.tensor<T, 1>());
functor::RGBToHSV<Device, T>()(context->eigen_device<Device>(), input_data,
range, output_data);

View File

@ -24,12 +24,12 @@ limitations under the License.
#include "tensorflow/core/util/cuda_kernel_helper.h"
#include "tensorflow/core/util/tensor_format.h"
#if !defined(_MSC_VER)
#define UNROLL _Pragma("unroll")
#define NOUNROLL _Pragma("nounroll")
#else
#if defined(_MSC_VER) && !defined(__clang__)
#define UNROLL
#define NOUNROLL
#else
#define UNROLL _Pragma("unroll")
#define NOUNROLL _Pragma("nounroll")
#endif
namespace tensorflow {

View File

@ -29,7 +29,6 @@ limitations under the License.
#include <vector>
#include "mkl_cblas.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/numeric_types.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
@ -41,9 +40,6 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#define MKL_Complex8 tensorflow::complex64
#define MKL_Complex16 tensorflow::complex128
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
@ -180,16 +176,16 @@ class BatchMatMulMkl : public OpKernel {
void MklCblasGemmBatch(const CBLAS_LAYOUT Layout, const bool TransA,
const bool TransB, const MKL_INT *M_Array,
const MKL_INT *N_Array, const MKL_INT *K_Array,
const MKL_Complex8 **A_Array, const MKL_INT *lda_Array,
const MKL_Complex8 **B_Array, const MKL_INT *ldb_Array,
MKL_Complex8 **C_Array, const MKL_INT *ldc_Array,
const complex64 **A_Array, const MKL_INT *lda_Array,
const complex64 **B_Array, const MKL_INT *ldb_Array,
complex64 **C_Array, const MKL_INT *ldc_Array,
const MKL_INT group_count, const MKL_INT *group_size) {
std::vector<CBLAS_TRANSPOSE> TransA_array(
group_size[0], TransA ? CblasConjTrans : CblasNoTrans);
std::vector<CBLAS_TRANSPOSE> TransB_array(
group_size[0], TransB ? CblasConjTrans : CblasNoTrans);
std::vector<MKL_Complex8> alpha_Array(group_size[0], {1.0f, 0.0f});
std::vector<MKL_Complex8> beta_Array(group_size[0], {0.0f, 0.0f});
std::vector<complex64> alpha_Array(group_size[0], {1.0f, 0.0f});
std::vector<complex64> beta_Array(group_size[0], {0.0f, 0.0f});
cblas_cgemm_batch(
Layout, &TransA_array[0], &TransB_array[0], M_Array, N_Array, K_Array,
static_cast<const void *>(&alpha_Array[0]),
@ -202,18 +198,16 @@ class BatchMatMulMkl : public OpKernel {
void MklCblasGemmBatch(const CBLAS_LAYOUT Layout, const bool TransA,
const bool TransB, const MKL_INT *M_Array,
const MKL_INT *N_Array, const MKL_INT *K_Array,
const MKL_Complex16 **A_Array,
const MKL_INT *lda_Array,
const MKL_Complex16 **B_Array,
const MKL_INT *ldb_Array, MKL_Complex16 **C_Array,
const MKL_INT *ldc_Array, const MKL_INT group_count,
const MKL_INT *group_size) {
const complex128 **A_Array, const MKL_INT *lda_Array,
const complex128 **B_Array, const MKL_INT *ldb_Array,
complex128 **C_Array, const MKL_INT *ldc_Array,
const MKL_INT group_count, const MKL_INT *group_size) {
std::vector<CBLAS_TRANSPOSE> TransA_array(
group_size[0], TransA ? CblasConjTrans : CblasNoTrans);
std::vector<CBLAS_TRANSPOSE> TransB_array(
group_size[0], TransB ? CblasConjTrans : CblasNoTrans);
std::vector<MKL_Complex16> alpha_Array(group_size[0], {1.0f, 0.0f});
std::vector<MKL_Complex16> beta_Array(group_size[0], {0.0f, 0.0f});
std::vector<complex128> alpha_Array(group_size[0], {1.0f, 0.0f});
std::vector<complex128> beta_Array(group_size[0], {0.0f, 0.0f});
cblas_zgemm_batch(
Layout, &TransA_array[0], &TransB_array[0], M_Array, N_Array, K_Array,
static_cast<const void *>(&alpha_Array[0]),

View File

@ -145,8 +145,8 @@ class MklInputConversionOp : public OpKernel {
const MklShape* mkl_shape;
const Tensor* tf_tensor;
MklShape* tf_mkl_shape;
uint mkl_tensor_index;
uint tf_tensor_index;
uint32 mkl_tensor_index;
uint32 tf_tensor_index;
if (input_shape_0.IsMklTensor() && !input_shape_1.IsMklTensor()) {
mkl_tensor = &input_tensor_0;
mkl_shape = &input_shape_0;

View File

@ -170,32 +170,32 @@ class MklMatMulOp : public OpKernel {
// Matrix-Matrix Multiplication with Complex64 (std::complex<float>) tensors.
// For detailed info about parameters, look at FP32 function description.
void MklBlasGemm(bool transa, bool transb, const int m, const int n,
const int k, const std::complex<float>* a, const int lda,
const std::complex<float>* b, const int ldb,
std::complex<float>* c, int const ldc) {
const int k, const complex64* a, const int lda,
const complex64* b, const int ldb, complex64* c,
int const ldc) {
const MKL_Complex8 alpha = {1.0f, 0.0f};
const MKL_Complex8 beta = {0.0f, 0.0f};
cblas_cgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans,
transb ? CblasTrans : CblasNoTrans, m, n, k,
static_cast<const void*>(&alpha), static_cast<const void*>(a),
lda, static_cast<const void*>(b), ldb,
static_cast<const void*>(&beta), static_cast<void*>(c), ldc);
transb ? CblasTrans : CblasNoTrans, m, n, k, &alpha,
reinterpret_cast<const MKL_Complex8*>(a), lda,
reinterpret_cast<const MKL_Complex8*>(b), ldb, &beta,
reinterpret_cast<MKL_Complex8*>(c), ldc);
}
// Matrix-Matrix Multiplication with Complex128 (std::complex<double>)
// tensors. For detailed info about parameters, look at FP32 function
// description.
void MklBlasGemm(bool transa, bool transb, const int m, const int n,
const int k, const std::complex<double>* a, const int lda,
const std::complex<double>* b, const int ldb,
std::complex<double>* c, const int ldc) {
const int k, const complex128* a, const int lda,
const complex128* b, const int ldb, complex128* c,
const int ldc) {
const MKL_Complex16 alpha = {1.0, 0.0};
const MKL_Complex16 beta = {0.0, 0.0};
cblas_zgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans,
transb ? CblasTrans : CblasNoTrans, m, n, k,
static_cast<const void*>(&alpha), static_cast<const void*>(a),
lda, static_cast<const void*>(b), ldb,
static_cast<const void*>(&beta), static_cast<void*>(c), ldc);
transb ? CblasTrans : CblasNoTrans, m, n, k, &alpha,
reinterpret_cast<const MKL_Complex16*>(a), lda,
reinterpret_cast<const MKL_Complex16*>(b), ldb, &beta,
reinterpret_cast<MKL_Complex16*>(c), ldc);
}
};

View File

@ -128,7 +128,7 @@ class MklToTfOp : public OpKernel {
#else
static void ConvertMklToTf(OpKernel* op_kernel, OpKernelContext* context,
string data_format_str, DataType op_data_type,
bool has_avx512f, uint input_number) {
bool has_avx512f, uint32 input_number) {
// Check that input tensor is in MKL format.
const Tensor& input_tensor = MklGetInput(context, input_number);
MklShape input_shape;

View File

@ -18,9 +18,6 @@ limitations under the License.
#ifdef INTEL_MKL
#define EIGEN_USE_THREADS
#include "tensorflow/core/framework/numeric_types.h"
#define MKL_Complex8 tensorflow::complex64
#define MKL_Complex16 tensorflow::complex128
#include "mkl_trans.h"
#include "tensorflow/core/kernels/transpose_functor.h"
#include "tensorflow/core/kernels/transpose_op.h"
@ -62,10 +59,37 @@ Status MKLTranspose2D(const char trans, const Tensor& in, Tensor* out);
INSTANTIATE(float, s)
INSTANTIATE(double, d)
INSTANTIATE(complex64, c)
INSTANTIATE(complex128, z)
#undef INSTANTIATE
template <>
Status MKLTranspose2D<complex64>(const char trans, const Tensor& in,
Tensor* out) {
const MKL_Complex8 alpha = {1.0f, 0.0f};
mkl_comatcopy(
'R', trans, in.dim_size(0), in.dim_size(1), alpha,
reinterpret_cast<const MKL_Complex8*>(in.flat<complex64>().data()),
in.dim_size(1),
reinterpret_cast<MKL_Complex8*>(
const_cast<complex64*>(out->flat<complex64>().data())),
in.dim_size(0));
return Status::OK();
}
template <>
Status MKLTranspose2D<complex128>(const char trans, const Tensor& in,
Tensor* out) {
const MKL_Complex16 alpha = {1.0, 0.0};
mkl_zomatcopy(
'R', trans, in.dim_size(0), in.dim_size(1), alpha,
reinterpret_cast<const MKL_Complex16*>(in.flat<complex128>().data()),
in.dim_size(1),
reinterpret_cast<MKL_Complex16*>(
const_cast<complex128*>(out->flat<complex128>().data())),
in.dim_size(0));
return Status::OK();
}
static const char kMKLTranspose = 'T';
static const char kMKLConjugateTranspose = 'C';

View File

@ -105,7 +105,7 @@ void DoNonMaxSuppressionOp(OpKernelContext* context, const Tensor& boxes,
}
const int output_size = std::min(max_output_size.scalar<int>()(), num_boxes);
typename TTypes<float, 2>::ConstTensor boxes_data = boxes.tensor<float, 2>();
TTypes<float, 2>::ConstTensor boxes_data = boxes.tensor<float, 2>();
std::vector<float> scores_data(num_boxes);
std::copy_n(scores.flat<float>().data(), num_boxes, scores_data.begin());
@ -138,8 +138,7 @@ void DoNonMaxSuppressionOp(OpKernelContext* context, const Tensor& boxes,
Tensor* output = nullptr;
TensorShape output_shape({static_cast<int>(selected.size())});
OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
typename TTypes<int, 1>::Tensor selected_indices_data =
output->tensor<int, 1>();
TTypes<int, 1>::Tensor selected_indices_data = output->tensor<int, 1>();
std::copy_n(selected.begin(), selected.size(), selected_indices_data.data());
}

View File

@ -29,7 +29,7 @@ limitations under the License.
#include "tensorflow/core/lib/random/random_distributions.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"
#ifdef COMPILER_MSVC
#if defined(_MSC_VER) && !defined(__clang__)
// msvc does not support unroll. One could try the loop pragma but we need to
// take a closer look if this generates better code in this case. For now let
// the compiler take care of it.

View File

@ -697,8 +697,8 @@ class QuantizedResizeBilinearOp : public OpKernel {
// Return if the output is empty.
if (st.output->NumElements() == 0) return;
typename TTypes<T, 4>::ConstTensor image_data = input.tensor<T, 4>();
typename TTypes<T, 4>::Tensor output_data = st.output->tensor<T, 4>();
typename TTypes<T, 4>::ConstTensor image_data(input.tensor<T, 4>());
typename TTypes<T, 4>::Tensor output_data(st.output->tensor<T, 4>());
ResizeBilinear<T>(image_data, st.height_scale, st.width_scale, in_min,
in_max, &output_data);

View File

@ -92,8 +92,8 @@ class RandomCropOp : public OpKernel {
// TODO(shlens): Do this more efficiently with memcpy once padding is
// available for smaller images.
typename TTypes<T, 3>::ConstTensor input_data = input.tensor<T, 3>();
typename TTypes<T, 3>::Tensor output_data = output->tensor<T, 3>();
typename TTypes<T, 3>::ConstTensor input_data(input.tensor<T, 3>());
typename TTypes<T, 3>::Tensor output_data(output->tensor<T, 3>());
for (int y = 0; y < target_height; ++y) {
for (int x = 0; x < target_width; ++x) {

View File

@ -149,7 +149,7 @@ class ResizeAreaOp : public OpKernel {
if (!context->status().ok()) return;
typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>();
typename TTypes<T, 4>::ConstTensor input_data(input.tensor<T, 4>());
// Precompute values used when iterating over x coordinates within a row.
// Note that it may be useful to cache x_interps for a given
@ -190,8 +190,7 @@ class ResizeAreaOp : public OpKernel {
void ComputeLoop(const ImageResizerState& st,
const std::vector<CachedInterpolation>& x_interps,
typename TTypes<T, 4>::ConstTensor input_data) {
typename TTypes<float, 4>::Tensor output_data =
st.output->tensor<float, 4>();
TTypes<float, 4>::Tensor output_data = st.output->tensor<float, 4>();
// When using this algorithm for downsizing, the target pixel value is the
// weighted average of all the source pixels. The weight is determined by

View File

@ -480,9 +480,8 @@ class ResizeBicubicOp : public OpKernel {
if (!context->status().ok()) return;
typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>();
typename TTypes<float, 4>::Tensor output_data =
st.output->tensor<float, 4>();
typename TTypes<T, 4>::ConstTensor input_data(input.tensor<T, 4>());
TTypes<float, 4>::Tensor output_data = st.output->tensor<float, 4>();
interpolate_with_caching<T>(input_data, st, output_data);
}
@ -510,9 +509,8 @@ class ResizeBicubicOpGrad : public OpKernel {
if (!context->status().ok()) return;
typename TTypes<float, 4>::ConstTensor input_grad =
input.tensor<float, 4>();
typename TTypes<T, 4>::Tensor output_grad = st.output->tensor<T, 4>();
TTypes<float, 4>::ConstTensor input_grad = input.tensor<float, 4>();
typename TTypes<T, 4>::Tensor output_grad(st.output->tensor<T, 4>());
ResizeBicubicGrad<T>(input_grad, st, output_grad);
}

View File

@ -51,9 +51,8 @@ class ResizeBilinearOp : public OpKernel {
// Return if the output is empty.
if (st.output->NumElements() == 0) return;
typename TTypes<T, 4>::ConstTensor image_data = input.tensor<T, 4>();
typename TTypes<float, 4>::Tensor output_data =
st.output->tensor<float, 4>();
typename TTypes<T, 4>::ConstTensor image_data(input.tensor<T, 4>());
TTypes<float, 4>::Tensor output_data = st.output->tensor<float, 4>();
functor::ResizeBilinear<Device, T>()(context->eigen_device<Device>(),
image_data, st.height_scale,
@ -258,9 +257,8 @@ class ResizeBilinearOpGrad : public OpKernel {
if (!context->status().ok()) return;
typename TTypes<float, 4>::ConstTensor input_grad =
input.tensor<float, 4>();
typename TTypes<T, 4>::Tensor output_grad = st.output->tensor<T, 4>();
TTypes<float, 4>::ConstTensor input_grad = input.tensor<float, 4>();
typename TTypes<T, 4>::Tensor output_grad(st.output->tensor<T, 4>());
functor::ResizeBilinearGrad<Device, T>()(context->eigen_device<Device>(),
input_grad, st.height_scale,

View File

@ -56,8 +56,8 @@ class ResizeNearestNeighborOp : public OpKernel {
// Return if the output is empty.
if (st.output->NumElements() == 0) return;
typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>();
typename TTypes<T, 4>::Tensor output_data = st.output->tensor<T, 4>();
typename TTypes<T, 4>::ConstTensor input_data(input.tensor<T, 4>());
typename TTypes<T, 4>::Tensor output_data(st.output->tensor<T, 4>());
bool status;
if (align_corners_) {
@ -162,8 +162,8 @@ class ResizeNearestNeighborOpGrad : public OpKernel {
// Return if the output is empty.
if (output->NumElements() == 0) return;
typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>();
typename TTypes<T, 4>::Tensor output_data = output->tensor<T, 4>();
typename TTypes<T, 4>::ConstTensor input_data(input.tensor<T, 4>());
typename TTypes<T, 4>::Tensor output_data(output->tensor<T, 4>());
const float height_scale =
CalculateResizeScale(out_height, in_height, align_corners_);

View File

@ -387,9 +387,9 @@ class SampleDistortedBoundingBoxV2Op : public OpKernel {
OP_REQUIRES_OK(
context, context->allocate_output(2, TensorShape({1, 1, 4}), &bboxes));
typename TTypes<T, 1>::Tensor begin_data = begin->tensor<T, 1>();
typename TTypes<T, 1>::Tensor size_data = size->tensor<T, 1>();
typename TTypes<float, 3>::Tensor bboxes_data = bboxes->tensor<float, 3>();
typename TTypes<T, 1>::Tensor begin_data(begin->tensor<T, 1>());
typename TTypes<T, 1>::Tensor size_data(size->tensor<T, 1>());
TTypes<float, 3>::Tensor bboxes_data = bboxes->tensor<float, 3>();
begin_data(0) = T(offset_height);
size_data(0) = T(target_height);

View File

@ -358,11 +358,11 @@ class MklSliceOp : public OpKernel {
/* data format = NCHW */
#pragma omp parallel for
for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
for (ssize_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
T* ip = in_buf + (d0 * in_strides[0]);
T* op = op_buf + ((d0 - begin[0]) * out_strides[0]);
#pragma omp parallel for
for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
for (ssize_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
T* ip1 = ip + (d1 * in_strides[1]);
T* op1 = op + ((d1 - begin[1]) * out_strides[1]);
// For NCHW, H and W will be contiguous. So we can copy
@ -376,15 +376,15 @@ class MklSliceOp : public OpKernel {
/* data_format = NHWC */
#pragma omp parallel for
for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
for (ssize_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
T* ip = in_buf + (d0 * in_strides[0]);
T* op = op_buf + ((d0 - begin[0]) * out_strides[0]);
#pragma omp parallel for
for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
for (ssize_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
T* ip1 = ip + (d1 * in_strides[1]);
T* op1 = op + ((d1 - begin[1]) * out_strides[1]);
#pragma omp parallel for
for (size_t d2 = begin[2]; d2 < begin[2] + size[2]; d2++) {
for (ssize_t d2 = begin[2]; d2 < begin[2] + size[2]; d2++) {
T* ip2 = ip1 + (d2 * in_strides[2]);
T* ip3 = ip2 + begin[3];
T* op2 = op1 + ((d2 - begin[2]) * out_strides[2]);

View File

@ -115,7 +115,7 @@ class SubstrOp : public OpKernel {
Tensor input_buffer;
OP_REQUIRES_OK(context, context->allocate_temp(
DT_STRING, output_shape, &input_buffer));
typename TTypes<string, 1>::Tensor input_bcast =
TTypes<string, 1>::Tensor input_bcast =
input_buffer.shaped<string, 1>(bcast.result_shape());
input_bcast =
input.broadcast(BCast::ToIndexArray<1>(bcast.x_bcast()));
@ -125,8 +125,8 @@ class SubstrOp : public OpKernel {
OP_REQUIRES_OK(context,
context->allocate_temp(DataTypeToEnum<T>::v(),
output_shape, &pos_buffer));
typename TTypes<T, 1>::Tensor pos_bcast =
pos_buffer.shaped<T, 1>(bcast.result_shape());
typename TTypes<T, 1>::Tensor pos_bcast(
pos_buffer.shaped<T, 1>(bcast.result_shape()));
pos_bcast =
pos_shaped.broadcast(BCast::ToIndexArray<1>(bcast.y_bcast()));
@ -135,8 +135,8 @@ class SubstrOp : public OpKernel {
OP_REQUIRES_OK(context,
context->allocate_temp(DataTypeToEnum<T>::v(),
output_shape, &len_buffer));
typename TTypes<T, 1>::Tensor len_bcast =
len_buffer.shaped<T, 1>(bcast.result_shape());
typename TTypes<T, 1>::Tensor len_bcast(
len_buffer.shaped<T, 1>(bcast.result_shape()));
len_bcast =
len_shaped.broadcast(BCast::ToIndexArray<1>(bcast.y_bcast()));
@ -164,7 +164,7 @@ class SubstrOp : public OpKernel {
Tensor input_buffer;
OP_REQUIRES_OK(context, context->allocate_temp(
DT_STRING, output_shape, &input_buffer));
typename TTypes<string, 2>::Tensor input_bcast =
TTypes<string, 2>::Tensor input_bcast =
input_buffer.shaped<string, 2>(bcast.result_shape());
input_bcast =
input.broadcast(BCast::ToIndexArray<2>(bcast.x_bcast()));
@ -174,8 +174,8 @@ class SubstrOp : public OpKernel {
OP_REQUIRES_OK(context,
context->allocate_temp(DataTypeToEnum<T>::v(),
output_shape, &pos_buffer));
typename TTypes<T, 2>::Tensor pos_bcast =
pos_buffer.shaped<T, 2>(bcast.result_shape());
typename TTypes<T, 2>::Tensor pos_bcast(
pos_buffer.shaped<T, 2>(bcast.result_shape()));
pos_bcast =
pos_shaped.broadcast(BCast::ToIndexArray<2>(bcast.y_bcast()));
@ -184,8 +184,8 @@ class SubstrOp : public OpKernel {
OP_REQUIRES_OK(context,
context->allocate_temp(DataTypeToEnum<T>::v(),
output_shape, &len_buffer));
typename TTypes<T, 2>::Tensor len_bcast =
len_buffer.shaped<T, 2>(bcast.result_shape());
typename TTypes<T, 2>::Tensor len_bcast(
len_buffer.shaped<T, 2>(bcast.result_shape()));
len_bcast =
len_shaped.broadcast(BCast::ToIndexArray<2>(bcast.y_bcast()));

View File

@ -27,9 +27,6 @@ void dummy_xsmm_conv2d_ensure_file_is_not_empty();
#include <stdlib.h>
#include <cstring>
#if 0
#include <omp.h>
#endif
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/lib/core/blocking_counter.h"
@ -360,7 +357,6 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
l_tick6 = libxsmm_timer_tick();
#endif
#if 1
BlockingCounter counter(num_threads);
for (int i = 0; i < num_threads; ++i) {
@ -371,14 +367,6 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
});
}
counter.Wait();
#else
#pragma omp parallel
{
chk_libxsmm_err(
libxsmm_dnn_execute_st(libxsmm_handle, kind, 0, omp_get_thread_num()),
"Worker");
}
#endif
#if defined(LIBXSMM_DETAILED_TIMING)
l_tick7 = libxsmm_timer_tick();

View File

@ -49,7 +49,7 @@ RecordWriterOptions RecordWriterOptions::CreateRecordWriterOptions(
#endif // IS_SLIM_BUILD
} else if (compression_type != compression::kNone) {
LOG(ERROR) << "Unsupported compression_type:" << compression_type
<< ". No comprression will be used.";
<< ". No compression will be used.";
}
return options;
}

View File

@ -619,6 +619,10 @@ REGISTER_OP("NonMaxSuppression")
TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &max_output_size));
// The boxes is a 2-D float Tensor of shape [num_boxes, 4].
DimensionHandle unused;
// The boxes[0] and scores[0] are both num_boxes.
TF_RETURN_IF_ERROR(
c->Merge(c->Dim(boxes, 0), c->Dim(scores, 0), &unused));
// The boxes[1] is 4.
TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 1), 4, &unused));
c->set_output(0, c->Vector(c->UnknownDim()));
@ -643,6 +647,10 @@ REGISTER_OP("NonMaxSuppressionV2")
TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &iou_threshold));
// The boxes is a 2-D float Tensor of shape [num_boxes, 4].
DimensionHandle unused;
// The boxes[0] and scores[0] are both num_boxes.
TF_RETURN_IF_ERROR(
c->Merge(c->Dim(boxes, 0), c->Dim(scores, 0), &unused));
// The boxes[1] is 4.
TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 1), 4, &unused));
c->set_output(0, c->Vector(c->UnknownDim()));

View File

@ -43,10 +43,11 @@ limitations under the License.
#elif defined(__arm__)
#define PLATFORM_POSIX
// Require an outside macro to tell us if we're building for Raspberry Pi.
#if !defined(RASPBERRY_PI)
// Require an outside macro to tell us if we're building for Raspberry Pi or
// another ARM device that's not a mobile platform.
#if !defined(RASPBERRY_PI) && !defined(ARM_NON_MOBILE)
#define IS_MOBILE_PLATFORM
#endif // !defined(RASPBERRY_PI)
#endif // !defined(RASPBERRY_PI) && !defined(ARM_NON_MOBILE)
#else
// If no platform specified, use:

View File

@ -387,7 +387,7 @@ message RunOptions {
// EXPERIMENTAL. Options used to initialize DebuggerState, if enabled.
DebugOptions debug_options = 6;
// When enabled, causes tensor alllocation information to be included in
// When enabled, causes tensor allocation information to be included in
// the error message when the Run() call fails because the allocator ran
// out of memory (OOM).
//

View File

@ -24,7 +24,7 @@ limitations under the License.
// TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
// "-beta", "-rc", "-rc.1")
#define TF_VERSION_SUFFIX "-rc0"
#define TF_VERSION_SUFFIX "-rc1"
#define TF_STR_HELPER(x) #x
#define TF_STR(x) TF_STR_HELPER(x)

View File

@ -1112,9 +1112,11 @@ inline void ForwardMklTensorInToOutWithMklShape(OpKernelContext* context,
// Forward the MKL shape ONLY (used in elementwise and other ops where
// we call the eigen implementation and MKL shape is not used)
inline void ForwardMklMetaDataInToOut(OpKernelContext* context,
uint idx_data_in, uint idx_data_out) {
uint idx_meta_in = GetTensorMetaDataIndex(idx_data_in, context->num_inputs());
uint idx_meta_out =
uint32 idx_data_in,
uint32_t idx_data_out) {
uint32 idx_meta_in =
GetTensorMetaDataIndex(idx_data_in, context->num_inputs());
uint32 idx_meta_out =
GetTensorMetaDataIndex(idx_data_out, context->num_outputs());
if (IsRefType(context->input_dtype(idx_data_in))) {
@ -1126,7 +1128,7 @@ inline void ForwardMklMetaDataInToOut(OpKernelContext* context,
// Set a dummy MKL shape (called when the output is in TF format)
inline void SetDummyMklShapeOutput(OpKernelContext* context,
uint idx_data_out) {
uint32 idx_data_out) {
MklShape mkl_shape_output;
mkl_shape_output.SetMklTensor(false);
AllocateOutputSetMklShape(context, idx_data_out, mkl_shape_output);

View File

@ -1,37 +1,86 @@
# Roadmap
**Last updated: January 23, 2017**
**Last updated: Feb 15, 2018**
TensorFlow is a fast moving project. In order for the community to better
understand what the near future will bring, this document shares what we are
working on internally. Many of these features were requested by the community,
and we welcome
[contributions](https://github.com/tensorflow/tensorflow/labels/stat%3Acontributions%20welcome).
TensorFlow is a rapidly moving, community supported project. This document is intended
to provide guidance about priorities and focus areas of the core set of TensorFlow
developers and about functionality that can be expected in the upcoming releases of
TensorFlow. Many of these areas are driven by community use cases, and we welcome
further
[contributions](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md)
to TensorFlow.
The features on this list are targeted for the next few months. At this point,
we do not have timelines for these features.
The features below do not have concrete release dates. However, the majority can be
expected in the next one to two releases.
### Improve non-Python language support
### APIs
#### High Level APIs:
* Easy multi-GPU utilization with Estimators
* Easy-to-use high-level pre-made estimators for Gradient Boosted Trees, Time Series, and other models
* Support for adding gradient computation for graphs constructed in other
languages (C++, Java, Go etc.)
#### Eager Execution:
* Efficient utilization of multiple GPUs
* Distributed training (multi-machine)
* Performance improvements
* Simpler export to a GraphDef/SavedModel
### Making TensorFlow easier to use
* High-level APIs
* Well-maintained models showing best practices
#### Keras API:
* Better integration with tf.data (ability to call `model.fit` with data tensors)
* Full support for Eager Execution (both Eager support for the regular Keras API, and the ability
to create Keras models Eager-style via Model subclassing)
* Better distribution/multi-GPU support and TPU support (including a smoother model-to-estimator workflow)
### Performance
* Speed and memory benchmarks
* Distributed full model benchmarks
* Performance and memory usage improvements
#### Official Models:
* A set of
[reference models](https://github.com/tensorflow/models/tree/master/official)
across image recognition, speech, object detection, and
translation that demonstrate best practices and serve as a starting point for
high-performance model development.
### Core Features
* Automatic op placement ([#2126](https://github.com/tensorflow/tensorflow/issues/2126))
* Support for graph-level functions
#### Contrib:
* Deprecation notices added to parts of tf.contrib where preferred implementations exist outside of tf.contrib.
* As much as possible, large projects inside tf.contrib moved to separate repositories.
* The tf.contrib module will eventually be discontinued in its current form; experimental development will in the future happen in other repositories.
#### Probabilistic Reasoning and Statistical Analysis:
* Rich set of tools for probabilistic and statistical analysis in tf.distributions
and tf.probability. These include new samplers, layers, optimizers, losses, and structured models
* Statistical tools for hypothesis testing, convergence diagnostics, and sample statistics
* Edward 2.0: High-level API for probabilistic programming
### Platforms
* OpenCL support ([#22](https://github.com/tensorflow/tensorflow/issues/22))
#### TensorFlow Lite:
* Increased coverage of supported ops in TensorFlow Lite
* Easier conversion of a trained TensorFlow graph for use on TensorFlow Lite
* Support for GPU acceleration in TensorFlow Lite (iOS and Android)
* Support for hardware accelerators via Android NeuralNets API
* Improved CPU performance by quantization and other network optimizations (e.g. pruning, distillation)
* Increased support for devices beyond Android and iOS (e.g. RPi, Cortex-M)
### Community
* More educational resources
* Better integration of TensorFlow into the opensource big data ecosystem (e.g.
[#2655](https://github.com/tensorflow/tensorflow/issues/2655))
### Performance
#### Distributed TensorFlow:
* Multi-GPU support optimized for a variety of GPU topologies
* Improved mechanisms for distributing computations on several machines
#### Optimizations:
* Mixed precision training support with initial example model and guide
* Native TensorRT support
* Int8 support for SkyLake via MKL
* Dynamic loading of SIMD-optimized kernels
### Documentation and Usability:
* Updated documentation, tutorials and Getting Started guides
* Process to enable external contributions to tutorials, documentation, and blogs showcasing best practice use-cases of TensorFlow and high-impact applications
### Community and Partner Engagement
#### Special Interest Groups:
* Mobilizing the community to work together in focused domains
* [tf-distribute](https://groups.google.com/a/tensorflow.org/forum/#!forum/tf-distribute): build and packaging of TensorFlow
* More to be identified and launched
#### Community:
* Incorporate public feedback on significant design decisions via a Request-for-Comment (RFC) process
* Formalize process for external contributions to land in TensorFlow and associated projects
* Grow global TensorFlow communities and user groups
* Collaborate with partners to co-develop and publish research papers

View File

@ -22,6 +22,14 @@ This section describes some of the current uses of the TensorFlow system.
> TensorFlow, or even better, send us a pull request to add an entry to this
> file.
* **Deep Speech**
<ul>
<li>**Organization**: Mozilla</li>
<li> **Domain**: Speech Recognition</li>
<li> **Description**: A TensorFlow implementation motivated by Baidu's Deep Speech architecture.</li>
<li> **More info**: [GitHub Repo](https://github.com/mozilla/deepspeech)</li>
</ul>
* **RankBrain**
<ul>
<li>**Organization**: Google</li>

View File

@ -7,6 +7,8 @@ the following documents:
a cluster of TensorFlow servers.
* @{$hadoop$How to run TensorFlow on Hadoop}, which has a highly
self-explanatory title.
* @{$s3$How to run TensorFlow with the S3 filesystem}, which explains how
to run TensorFlow with the S3 file system.
* The entire document set for [TensorFlow serving](/serving), an open-source,
flexible, high-performance serving system for machine-learned models
designed for production environments. TensorFlow Serving provides

View File

@ -1,3 +1,4 @@
index.md
distributed.md
hadoop.md
s3.md

View File

@ -0,0 +1,40 @@
# How to run TensorFlow on S3
This document describes how to run TensorFlow on the S3 file system.
## S3
We assume that you are familiar with @{$reading_data$reading data}.
To use S3 with TensorFlow, change the file paths you use to read and write
data to an S3 path. For example:
```python
filenames = ["s3://bucketname/path/to/file1.tfrecord",
"s3://bucketname/path/to/file2.tfrecord"]
dataset = tf.data.TFRecordDataset(filenames)
```
When reading or writing data on S3 with your TensorFlow program, the behavior
can be controlled by several environment variables (a short sketch of setting
them follows this list):
* **AWS_REGION**: By default, the regional endpoint is used for S3, with the
region controlled by `AWS_REGION`. If `AWS_REGION` is not specified, then
`us-east-1` is used.
* **S3_ENDPOINT**: The endpoint can be overridden explicitly by setting
`S3_ENDPOINT`.
* **S3_USE_HTTPS**: HTTPS is used to access S3 by default, unless
`S3_USE_HTTPS=0` is set.
* **S3_VERIFY_SSL**: If HTTPS is used, SSL verification can be disabled
with `S3_VERIFY_SSL=0`.
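A minimal sketch of setting these variables from Python before any S3 path is accessed; the region and endpoint values below are placeholders:
```python
import os

# Placeholder region/endpoint; set these before the first s3:// access.
os.environ["AWS_REGION"] = "us-west-2"
os.environ["S3_ENDPOINT"] = "s3.us-west-2.amazonaws.com"
os.environ["S3_USE_HTTPS"] = "1"   # default: use HTTPS
os.environ["S3_VERIFY_SSL"] = "1"  # default: verify SSL certificates

import tensorflow as tf

filenames = ["s3://bucketname/path/to/file1.tfrecord"]
dataset = tf.data.TFRecordDataset(filenames)
```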
To read or write objects in a bucket that is not publicly accessible,
AWS credentials must be provided through one of the following methods:
* Set credentials in the AWS credentials profile file on the local system,
located at: `~/.aws/credentials` on Linux, macOS, or Unix, or
`C:\Users\USERNAME\.aws\credentials` on Windows.
* Set the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment
variables.
* If TensorFlow is deployed on an EC2 instance, specify an IAM role and then
give the EC2 instance access to that role.

View File

@ -81,6 +81,8 @@ filesystem implementations call their existing libraries. Examples include:
plugin](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/hadoop/hadoop_file_system.h)
* [GCS
plugin](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/cloud/gcs_file_system.h)
* [S3
plugin](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/s3/s3_file_system.h)
#### The File interfaces

View File

@ -38,7 +38,7 @@ enable TensorFlow for C:
OS="linux" # Change to "darwin" for macOS
TARGET_DIRECTORY="/usr/local"
curl -L \
"https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.6.0-rc0.tar.gz" |
"https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.6.0-rc1.tar.gz" |
sudo tar -C $TARGET_DIRECTORY -xz
The `tar` command extracts the TensorFlow C library into the `lib`

View File

@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
TF_TYPE="cpu" # Change to "gpu" for GPU support
TARGET_DIRECTORY='/usr/local'
curl -L \
"https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.6.0-rc0.tar.gz" |
"https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.6.0-rc1.tar.gz" |
sudo tar -C $TARGET_DIRECTORY -xz
The `tar` command extracts the TensorFlow C library into the `lib`

View File

@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
<dependency>
<groupId>org.tensorflow</groupId>
<artifactId>tensorflow</artifactId>
<version>1.6.0-rc0</version>
<version>1.6.0-rc1</version>
</dependency>
```
@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
<dependency>
<groupId>org.tensorflow</groupId>
<artifactId>tensorflow</artifactId>
<version>1.6.0-rc0</version>
<version>1.6.0-rc1</version>
</dependency>
</dependencies>
</project>
@ -123,12 +123,12 @@ instead:
<dependency>
<groupId>org.tensorflow</groupId>
<artifactId>libtensorflow</artifactId>
<version>1.6.0-rc0</version>
<version>1.6.0-rc1</version>
</dependency>
<dependency>
<groupId>org.tensorflow</groupId>
<artifactId>libtensorflow_jni_gpu</artifactId>
<version>1.6.0-rc0</version>
<version>1.6.0-rc1</version>
</dependency>
```
@ -147,7 +147,7 @@ refer to the simpler instructions above instead.
Take the following steps to install TensorFlow for Java on Linux or macOS:
1. Download
[libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.6.0-rc0.jar),
[libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.6.0-rc1.jar),
which is the TensorFlow Java Archive (JAR).
2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@ -166,7 +166,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
OS=$(uname -s | tr '[:upper:]' '[:lower:]')
mkdir -p ./jni
curl -L \
"https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.6.0-rc0.tar.gz" |
"https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.6.0-rc1.tar.gz" |
tar -xz -C ./jni
### Install on Windows
@ -174,10 +174,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
Take the following steps to install TensorFlow for Java on Windows:
1. Download
[libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.6.0-rc0.jar),
[libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.6.0-rc1.jar),
which is the TensorFlow Java Archive (JAR).
2. Download the following Java Native Interface (JNI) file appropriate for
[TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.6.0-rc0.zip).
[TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.6.0-rc1.zip).
3. Extract this .zip file.
@ -225,7 +225,7 @@ must be part of your `classpath`. For example, you can include the
downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
as follows:
<pre><b>javac -cp libtensorflow-1.6.0-rc0.jar HelloTF.java</b></pre>
<pre><b>javac -cp libtensorflow-1.6.0-rc1.jar HelloTF.java</b></pre>
### Running
@ -239,11 +239,11 @@ two files are available to the JVM:
For example, the following command line executes the `HelloTF` program on Linux
and macOS X:
<pre><b>java -cp libtensorflow-1.6.0-rc0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
<pre><b>java -cp libtensorflow-1.6.0-rc1.jar:. -Djava.library.path=./jni HelloTF</b></pre>
And the following command line executes the `HelloTF` program on Windows:
<pre><b>java -cp libtensorflow-1.6.0-rc0.jar;. -Djava.library.path=jni HelloTF</b></pre>
<pre><b>java -cp libtensorflow-1.6.0-rc1.jar;. -Djava.library.path=jni HelloTF</b></pre>
If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
installed TensorFlow for Java and are ready to use the API. If the program

View File

@ -188,7 +188,7 @@ Take the following steps to install TensorFlow with Virtualenv:
Virtualenv environment:
<pre>(tensorflow)$ <b>pip3 install --upgrade \
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
If you encounter installation problems, see
[Common Installation Problems](#common_installation_problems).
@ -293,7 +293,7 @@ take the following steps:
<pre>
$ <b>sudo pip3 install --upgrade \
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl</b>
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc1-cp34-cp34m-linux_x86_64.whl</b>
</pre>
If this step fails, see
@ -480,8 +480,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
<pre>
(tensorflow)$ <b>pip install --ignore-installed --upgrade \
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
<a name="ValidateYourInstallation"></a>
## Validate your installation
@ -648,14 +647,14 @@ This section documents the relevant values for Linux installations.
CPU only:
<pre>
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp27-none-linux_x86_64.whl
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc1-cp27-none-linux_x86_64.whl
</pre>
GPU support:
<pre>
https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp27-none-linux_x86_64.whl
https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc1-cp27-none-linux_x86_64.whl
</pre>
Note that GPU support requires the NVIDIA hardware and software described in
@ -667,14 +666,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
CPU only:
<pre>
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc1-cp34-cp34m-linux_x86_64.whl
</pre>
GPU support:
<pre>
https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp34-cp34m-linux_x86_64.whl
https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc1-cp34-cp34m-linux_x86_64.whl
</pre>
Note that GPU support requires the NVIDIA hardware and software described in
@ -686,14 +685,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
CPU only:
<pre>
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp35-cp35m-linux_x86_64.whl
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc1-cp35-cp35m-linux_x86_64.whl
</pre>
GPU support:
<pre>
https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp35-cp35m-linux_x86_64.whl
https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc1-cp35-cp35m-linux_x86_64.whl
</pre>
@ -705,14 +704,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
CPU only:
<pre>
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp36-cp36m-linux_x86_64.whl
https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc1-cp36-cp36m-linux_x86_64.whl
</pre>
GPU support:
<pre>
https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp36-cp36m-linux_x86_64.whl
https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc1-cp36-cp36m-linux_x86_64.whl
</pre>

View File

@ -119,7 +119,7 @@ Take the following steps to install TensorFlow with Virtualenv:
TensorFlow in the active Virtualenv is as follows:
<pre> $ <b>pip3 install --upgrade \
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py3-none-any.whl</b></pre>
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc1-py3-none-any.whl</b></pre>
If you encounter installation problems, see
[Common Installation Problems](#common-installation-problems).
@ -242,7 +242,7 @@ take the following steps:
issue the following command:
<pre> $ <b>sudo pip3 install --upgrade \
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py3-none-any.whl</b> </pre>
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc1-py3-none-any.whl</b> </pre>
If the preceding command fails, see
[installation problems](#common-installation-problems).
@ -351,7 +351,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
TensorFlow for Python 2.7:
<pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py2-none-any.whl</b></pre>
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc1-py2-none-any.whl</b></pre>
<a name="ValidateYourInstallation"></a>
@ -524,7 +524,7 @@ This section documents the relevant values for Mac OS installations.
<pre>
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py2-none-any.whl
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc1-py2-none-any.whl
</pre>
@ -532,5 +532,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py2-none-a
<pre>
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py3-none-any.whl
https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc1-py3-none-any.whl
</pre>

View File

@ -359,10 +359,10 @@ Invoke `pip install` to install that pip package.
The filename of the `.whl` file depends on your platform.
For example, the following command will install the pip package
for TensorFlow 1.6.0rc0 on Linux:
for TensorFlow 1.6.0rc1 on Linux:
<pre>
$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.6.0rc0-py2-none-any.whl</b>
$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.6.0rc1-py2-none-any.whl</b>
</pre>
## Validate your installation
@ -393,7 +393,7 @@ TensorFlow programs:
<pre>Hello, TensorFlow!</pre>
If you are new to TensorFlow, see @{$get_started$Getting Started with
If you are new to TensorFlow, see @{$get_started/premade_estimators$Getting Started with
TensorFlow}.
If the system outputs an error message instead of a greeting, see [Common
@ -460,8 +460,8 @@ Stack Overflow and specify the `tensorflow` tag.
**Linux**
<table>
<tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
<tr><td>tensorflow-1.6.0rc0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow_gpu-1.6.0rc0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
<tr><td>tensorflow-1.6.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow_gpu-1.6.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
<tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow_gpu-1.5.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>7</td><td>9</td></tr>
<tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
@ -479,7 +479,7 @@ Stack Overflow and specify the `tensorflow` tag.
**Mac**
<table>
<tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
<tr><td>tensorflow-1.6.0rc0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow-1.6.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow-1.3.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
@ -493,8 +493,8 @@ Stack Overflow and specify the `tensorflow` tag.
**Windows**
<table>
<tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
<tr><td>tensorflow-1.6.0rc0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow_gpu-1.6.0rc0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
<tr><td>tensorflow-1.6.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow_gpu-1.6.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
<tr><td>tensorflow-1.5.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
<tr><td>tensorflow_gpu-1.5.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
<tr><td>tensorflow-1.4.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>

Some files were not shown because too many files have changed in this diff.