Merge changes from github.

Change: 151046259
2017-03-23 12:31:16 -08:00 · 2017-03-23 12:31:16 -08:00 · bc456e361d
commit bc456e361d
parent 8ca0714565
141 changed files with 4407 additions and 602 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,6 +1,7 @@
 .DS_Store
 .ipynb_checkpoints
 node_modules
+/.bazelrc
 /bazel-*
 /third_party/py/numpy/numpy_include
 /tools/bazel.rc
--- a/RELEASE.md
+++ b/RELEASE.md
@ -1,3 +1,10 @@
+# Release 1.0.1
+
+## Bug Fixes and Other Changes
+* Change GraphConstructor to not increase the version when importing, but instead take the min of all versions.
+* Google Cloud Storage fixes.
+* Removed `tf.core` and `tf.python` modules from the API. These were never intended to be exposed. Please use the same objects through top-level `tf` module instead.
+
 # Release 1.0.0

 ## Major Features and Improvements
@ -88,6 +95,8 @@ To help you upgrade your existing TensorFlow Python code to match the API change
  from the tensorflow::ops namespace to tensorflow.
 * Change arg order for `{softmax,sparse_softmax,sigmoid}_cross_entropy_with_logits` to be (labels, predictions), and force use of named args.
 * tf.nn.rnn_cell.* and most functions in tf.nn.rnn.* (with the exception of dynamic_rnn and raw_rnn) are temporarily in tf.contrib.rnn.  They will be moved back into core for TF 1.1.
+* `tf.nn.sampled_softmax_loss` and `tf.nn.nce_loss` have both changed their API: the `inputs, labels` parameters must be switched to `labels, inputs`.
+* The `shape` keyword argument of the `SparseTensor` constructor was renamed to `dense_shape` between TensorFlow 0.12 and TensorFlow 1.0.

 ## Bug Fixes and Other Changes
 * Numerous C++ API updates.
--- a/7
+++ b/7
@ -14,12 +14,7 @@ load("@io_bazel_rules_closure//closure:defs.bzl", "closure_repositories")

 closure_repositories()

-load("//tensorflow:workspace.bzl", "check_version", "tf_workspace")
-
-# We must check the bazel version before trying to parse any other BUILD files,
-# in case the parsing of those build files depends on the bazel version we
-# require here.
-check_version("0.4.2")
+load("//tensorflow:workspace.bzl", "tf_workspace")

 # Uncomment and update the paths in these entries to build the Android demo.
 #android_sdk_repository(
--- a/65
+++ b/65
@ -8,6 +8,9 @@ pushd `dirname $0` > /dev/null
 SOURCE_BASE_DIR=`pwd -P`
 popd > /dev/null

+# This file contains customized config settings.
+touch .bazelrc
+
 PLATFORM="$(uname -s | tr 'A-Z' 'a-z')"

 function is_linux() {
@ -36,15 +39,11 @@ function is_windows() {
 }

 function bazel_clean_and_fetch() {
-  # bazel clean --expunge currently doesn't work on Windows
-  # TODO(pcloudy): Re-enable it after bazel clean --expunge is fixed.
-  if ! is_windows; then
-    bazel clean --expunge
-  fi
  if [ -z "$TF_BAZEL_TARGETS" ]; then
-    TF_BAZEL_TARGETS="//tensorflow/... -//tensorflow/contrib/nccl/... -//tensorflow/examples/android/..."
+    bazel fetch "//tensorflow/... -//tensorflow/contrib/nccl/... -//tensorflow/examples/android/..."
+  else
+    bazel fetch $TF_BAZEL_TARGETS
  fi
-  bazel fetch "$TF_BAZEL_TARGETS"
 }

 function sed_hyphen_i() {
@ -102,8 +101,8 @@ if false; then # Disable building with MKL for now

  if [ "$TF_NEED_MKL" == "1" ]; then # TF_NEED_MKL
    DST=`dirname $0`
-    ARCHIVE_BASENAME=mklml_lnx_2017.0.2.20170110.tgz
-    GITHUB_RELEASE_TAG=v0.3
+    ARCHIVE_BASENAME=mklml_lnx_2017.0.2.20170209.tgz
+    GITHUB_RELEASE_TAG=v0.5
    MKLURL="https://github.com/01org/mkl-dnn/releases/download/$GITHUB_RELEASE_TAG/$ARCHIVE_BASENAME"
    if ! [ -e "$DST/third_party/mkl/$ARCHIVE_BASENAME" ]; then
      wget --no-check-certificate -P $DST/third_party/mkl/ $MKLURL
@ -182,13 +181,12 @@ else
  TF_NEED_JEMALLOC=0
 fi

-if [ "$TF_NEED_JEMALLOC" == "1" ]; then
-  sed_hyphen_i -e "s/WITH_JEMALLOC = False/WITH_JEMALLOC = True/" tensorflow/core/platform/default/build_config.bzl
-else
-  sed_hyphen_i -e "s/WITH_JEMALLOC = True/WITH_JEMALLOC = False/" tensorflow/core/platform/default/build_config.bzl
+sed_hyphen_i -e "/with_jemalloc/d" .bazelrc
+if [[ "$TF_NEED_JEMALLOC" == "1" ]]; then
+  echo 'build --define with_jemalloc=true' >>.bazelrc
 fi

-while [ "$TF_NEED_GCP" == "" ]; do
+while [[ "$TF_NEED_GCP" == "" ]]; do
  read -p "Do you wish to build TensorFlow with "\
 "Google Cloud Platform support? [y/N] " INPUT
  case $INPUT in
@ -202,23 +200,12 @@ while [ "$TF_NEED_GCP" == "" ]; do
  esac
 done

-if [ "$TF_NEED_GCP" == "1" ]; then
-  ## Verify that libcurl header files are available.
-  # Only check Linux, since on MacOS the header files are installed with XCode.
-  if is_linux && [[ ! -f "/usr/include/curl/curl.h" ]]; then
-    echo "ERROR: It appears that the development version of libcurl is not "\
-"available. Please install the libcurl3-dev package."
-    exit 1
+sed_hyphen_i -e "/with_gcp_support/d" .bazelrc
+if [[ "$TF_NEED_GCP" == "1" ]]; then
+  echo 'build --define with_gcp_support=true' >>.bazelrc
 fi

-  # Update Bazel build configuration.
-  sed_hyphen_i -e "s/WITH_GCP_SUPPORT = False/WITH_GCP_SUPPORT = True/" tensorflow/core/platform/default/build_config.bzl
-else
-  # Update Bazel build configuration.
-  sed_hyphen_i -e "s/WITH_GCP_SUPPORT = True/WITH_GCP_SUPPORT = False/" tensorflow/core/platform/default/build_config.bzl
-fi
-
-while [ "$TF_NEED_HDFS" == "" ]; do
+while [[ "$TF_NEED_HDFS" == "" ]]; do
  read -p "Do you wish to build TensorFlow with "\
 "Hadoop File System support? [y/N] " INPUT
  case $INPUT in
@ -232,16 +219,13 @@ while [ "$TF_NEED_HDFS" == "" ]; do
  esac
 done

-if [ "$TF_NEED_HDFS" == "1" ]; then
-  # Update Bazel build configuration.
-  sed_hyphen_i -e "s/WITH_HDFS_SUPPORT = False/WITH_HDFS_SUPPORT = True/" tensorflow/core/platform/default/build_config.bzl
-else
-  # Update Bazel build configuration.
-  sed_hyphen_i -e "s/WITH_HDFS_SUPPORT = True/WITH_HDFS_SUPPORT = False/" tensorflow/core/platform/default/build_config.bzl
+sed_hyphen_i -e "/with_hdfs_support/d" .bazelrc
+if [[ "$TF_NEED_HDFS" == "1" ]]; then
+  echo 'build --define with_hdfs_support=true' >>.bazelrc
 fi

 ## Enable XLA.
-while [ "$TF_ENABLE_XLA" == "" ]; do
+while [[ "$TF_ENABLE_XLA" == "" ]]; do
  read -p "Do you wish to build TensorFlow with the XLA just-in-time compiler (experimental)? [y/N] " INPUT
  case $INPUT in
    [Yy]* ) echo "XLA JIT support will be enabled for TensorFlow"; TF_ENABLE_XLA=1;;
@ -251,12 +235,9 @@ while [ "$TF_ENABLE_XLA" == "" ]; do
  esac
 done

-if [ "$TF_ENABLE_XLA" == "1" ]; then
-  # Update Bazel build configuration.
-  sed_hyphen_i -e "s/^WITH_XLA_SUPPORT = [FT].*/WITH_XLA_SUPPORT = True/" tensorflow/core/platform/default/build_config_root.bzl
-else
-  # Update Bazel build configuration.
-  sed_hyphen_i -e "s/^WITH_XLA_SUPPORT = [FT].*/WITH_XLA_SUPPORT = False/" tensorflow/core/platform/default/build_config_root.bzl
+sed_hyphen_i -e "/with_xla_support/d" .bazelrc
+if [[ "$TF_ENABLE_XLA" == "1" ]]; then
+  echo 'build --define with_xla_support=true' >>.bazelrc
 fi


--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@ -110,6 +110,34 @@ config_setting(
    visibility = ["//visibility:public"],
 )

+# TODO(jhseu): Enable on platforms other than Linux.
+config_setting(
+    name = "with_jemalloc",
+    values = {
+        "cpu": "k8",
+        "define": "with_jemalloc=true",
+    },
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "with_gcp_support",
+    values = {"define": "with_gcp_support=true"},
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "with_hdfs_support",
+    values = {"define": "with_hdfs_support=true"},
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "with_xla_support",
+    values = {"define": "with_xla_support=true"},
+    visibility = ["//visibility:public"],
+)
+
 package_group(
    name = "internal",
    packages = ["//tensorflow/..."],
@ -321,6 +349,8 @@ cc_binary(
    deps = [
        "//tensorflow/c:c_api",
        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:client_session",
+        "//tensorflow/cc:scope",
        "//tensorflow/core:tensorflow",
    ],
 )
--- a/tensorflow/compiler/xla/service/allocation_tracker.cc
+++ b/tensorflow/compiler/xla/service/allocation_tracker.cc
@ -138,7 +138,8 @@ tensorflow::Status AllocationTracker::DeallocateShape(
    TF_RET_CHECK(ShapeUtil::TupleElementCount(shape) == elements.size())
        << "tuple has unexpected number of elements: " << elements.size()
        << " != " << ShapeUtil::TupleElementCount(shape);
-    for (int i = 0; i < elements.size(); ++i) {
+    for (std::vector<se::DeviceMemoryBase>::size_type i = 0;
+         i < elements.size(); ++i) {
      VLOG(2) << "recursing onto the tuple elements";
      TF_RETURN_IF_ERROR(DeallocateShape(backend, device_ordinal, &elements[i],
                                         shape.tuple_shapes(i),
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
@ -118,10 +118,10 @@ GenericTransferManager::ShallowCopyTupleFromDevice(

  // Create a DeviceMemoryBase from each void* pointer.
  std::vector<se::DeviceMemoryBase> destination;
-  for (int i = 0; i < element_pointers.size(); ++i) {
+  for (std::vector<void*>::size_type i = 0; i < element_pointers.size(); ++i) {
    if (element_pointers[i] == nullptr &&
        !ShapeUtil::HasZeroElements(shape.tuple_shapes(i))) {
-      return FailedPrecondition("tuple contains nullptr at element %d", i);
+      return FailedPrecondition("tuple contains nullptr at element %lu", i);
    }
    int64 buffer_size = ShapeUtil::ByteSizeOf(shape.tuple_shapes(i),
                                              /*pointer_size=*/sizeof(void*));
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@ -256,7 +256,8 @@ StatusOr<std::vector<const Allocation*>> Service::ResolveAndValidateArguments(
    tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
    const Backend* backend, int device_ordinal) {
  std::vector<const Allocation*> allocations;
-  for (int i = 0; i < arguments.size(); ++i) {
+  for (tensorflow::gtl::ArraySlice<const GlobalDataHandle*>::size_type i = 0; 
+       i < arguments.size(); ++i) {
    auto allocation_status = allocation_tracker_.Resolve(*arguments[i]);
    if (!allocation_status.ok()) {
      return Status(allocation_status.status().code(),
@ -269,7 +270,7 @@ StatusOr<std::vector<const Allocation*>> Service::ResolveAndValidateArguments(
    if (allocation->backend() != backend ||
        allocation->device_ordinal() != device_ordinal) {
      return InvalidArgument(
-          "argument %d is on device %s but computation will be executed "
+          "argument %lu is on device %s but computation will be executed "
          "on device %s",
          i,
          allocation->backend()
@ -295,13 +296,14 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
                           program_shape.parameters_size(), arguments.size());
  }

-  for (int i = 0; i < arguments.size(); ++i) {
+  for (tensorflow::gtl::ArraySlice<const Allocation*>::size_type i = 0;
+       i < arguments.size(); ++i) {
    // Verify that shape of arguments matches the shape of the arguments in the
    // ProgramShape.
    if (!ShapeUtil::Compatible(arguments[i]->shape(),
                               program_shape.parameters(i))) {
      return InvalidArgument(
-          "computation expects parameter %d to have shape %s, given shape %s",
+          "computation expects parameter %lu to have shape %s, given shape %s",
          i, ShapeUtil::HumanString(program_shape.parameters(i)).c_str(),
          ShapeUtil::HumanString(arguments[i]->shape()).c_str());
    }
@ -383,7 +385,8 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
                          hlo_dumper, std::move(executors)));

  if (!other_directory_path.empty()) {
-    for (int64 i = 0; i < versioned_handles.size(); ++i) {
+    for (std::vector<VersionedComputationHandle>::size_type i = 0;
+         i < versioned_handles.size(); ++i) {
      executables[i]->set_session_module(std::move(session_modules[i]));
    }
  }
@ -523,7 +526,8 @@ Service::ExecuteParallelAndRegisterResult(

  // Asynchronously launch all executables.
  std::vector<GlobalDataHandle> result_handles;
-  for (int64 i = 0; i < executables.size(); i++) {
+  for (tensorflow::gtl::ArraySlice<Executable*>::size_type i = 0;
+       i < executables.size(); i++) {
    TF_ASSIGN_OR_RETURN(
        perftools::gputools::DeviceMemoryBase result,
        executables[i]->ExecuteAsyncOnStream(&run_options[i], arguments[i]));
--- a/tensorflow/contrib/android/BUILD
+++ b/tensorflow/contrib/android/BUILD
@ -72,13 +72,17 @@ LINKER_SCRIPT = "//tensorflow/contrib/android:jni/version_script.lds"
 cc_binary(
    name = "libtensorflow_inference.so",
    srcs = [],
-    copts = tf_copts(),
+    copts = tf_copts() + [
+        "-ffunction-sections",
+        "-fdata-sections",
+    ],
    linkopts = if_android([
        "-landroid",
        "-llog",
        "-lm",
        "-z defs",
        "-s",
+        "-Wl,--gc-sections",
        "-Wl,--version-script",  # This line must be directly followed by LINKER_SCRIPT.
        LINKER_SCRIPT,
    ]),
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@ -56,9 +56,10 @@ mark_as_advanced(DOWNLOAD_LOCATION)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 add_definitions(-DEIGEN_AVOID_STL_ARRAY)
 if(WIN32)
-  add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC -D__VERSION__=\"MSVC\")
+  add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC)
  add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN64 -DWIN32_LEAN_AND_MEAN -DNOGDI -DPLATFORM_WINDOWS)
  add_definitions(-DTENSORFLOW_USE_EIGEN_THREADPOOL -DEIGEN_HAS_C99_MATH -D_ITERATOR_DEBUG_LEVEL=0)
+  add_definitions(-DTF_COMPILE_LIBRARY)
  add_definitions(-DNDEBUG /O2)  # Equivalent of -c opt in Bazel.
  add_definitions(/bigobj /nologo /EHsc /GF /FC /MP /Gm-)
  # Suppress warnings to reduce build log size.
@ -190,6 +191,7 @@ if (tensorflow_ENABLE_GPU)
      ${CUDA_TOOLKIT_TARGET_DIR}/include/cuda.h ${CUDA_TOOLKIT_TARGET_DIR}/include/cuComplex.h
      ${CUDA_TOOLKIT_TARGET_DIR}/include/cublas_v2.h ${CUDNN_HOME}/include/cudnn.h
      ${CUDA_TOOLKIT_TARGET_DIR}/include/cufft.h ${CUDA_TOOLKIT_TARGET_DIR}/include/curand.h
+      ${CUDA_TOOLKIT_TARGET_DIR}/include/cuda_runtime_api.h
      DESTINATION ${tensorflow_source_dir}/third_party/gpus/cuda/include
    )
    include_directories(${tensorflow_source_dir}/third_party/gpus)
--- a/tensorflow/contrib/cmake/README.md
+++ b/tensorflow/contrib/cmake/README.md
@ -13,7 +13,7 @@ Linux.
 Current Status
 --------------

-CMake can be used to build TensorFlow on Windows. See the [getting started documentation](https://www.tensorflow.org/get_started/os_setup.html#pip-installation-on-windows)
+CMake can be used to build TensorFlow on Windows. See the [getting started documentation](https://www.tensorflow.org/install/install_windows)
 for instructions on how to install a pre-built TensorFlow package on Windows.

 ### Current known limitations
--- a/tensorflow/contrib/cmake/tf_cc_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_cc_ops.cmake
@ -120,3 +120,43 @@ list(REMOVE_ITEM tf_cc_srcs ${tf_cc_test_srcs})

 add_library(tf_cc OBJECT ${tf_cc_srcs})
 add_dependencies(tf_cc tf_cc_framework tf_cc_ops)
+
+set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.lib")
+add_custom_target(tf_extension_ops)
+
+function(AddUserOps)
+  cmake_parse_arguments(_AT "" "" "TARGET;SOURCES;GPUSOURCES;DEPENDS;DISTCOPY" ${ARGN})
+  if (tensorflow_ENABLE_GPU AND _AT_GPUSOURCES)
+    # if gpu build is enabled and we have gpu specific code,
+    # hint to cmake that this needs to go to nvcc
+    set (gpu_source ${_AT_GPUSOURCES})
+    set (gpu_lib "${_AT_TARGET}_gpu")
+    set_source_files_properties(${gpu_source} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
+    cuda_compile(gpu_lib ${gpu_source})
+  endif()
+  # create shared library from source and cuda obj
+  add_library(${_AT_TARGET} SHARED ${_AT_SOURCES} ${gpu_lib})
+  target_link_libraries(${_AT_TARGET} ${pywrap_tensorflow_lib})
+  if(WIN32)
+    if (tensorflow_ENABLE_GPU AND _AT_GPUSOURCES)
+        # some ops call out to cuda directly; need to link libs for the cuda dlls
+        target_link_libraries(${_AT_TARGET} ${CUDA_LIBRARIES})
+    endif()
+    if (_AT_DISTCOPY)
+        add_custom_command(TARGET ${_AT_TARGET} POST_BUILD
+            COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:${_AT_TARGET}> ${_AT_DISTCOPY}/)
+    endif()
+  endif()
+  if (_AT_DEPENDS)
+    add_dependencies(${_AT_TARGET} ${_AT_DEPENDS})
+  endif()
+  # make sure TF_COMPILE_LIBRARY is not defined for this target
+  get_target_property(target_compile_flags  ${_AT_TARGET} COMPILE_FLAGS)
+  if(target_compile_flags STREQUAL "target_compile_flags-NOTFOUND")
+    set(target_compile_flags "/UTF_COMPILE_LIBRARY")
+  else()
+    set(target_compile_flags "${target_compile_flags} /UTF_COMPILE_LIBRARY")
+  endif()
+  set_target_properties(${_AT_TARGET} PROPERTIES COMPILE_FLAGS ${target_compile_flags})
+  add_dependencies(tf_extension_ops ${_AT_TARGET})
+endfunction(AddUserOps)
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@ -199,7 +199,6 @@ add_custom_command(OUTPUT
    COMMAND ${PYTHON_EXECUTABLE} ${tensorflow_source_dir}/tensorflow/tools/git/gen_git_source.py
    --raw_generate ${VERSION_INFO_CC}
    DEPENDS __force_rebuild)
-
 set(tf_version_srcs ${tensorflow_source_dir}/tensorflow/core/util/version_info.cc)

 ########################################################
@ -238,3 +237,9 @@ add_dependencies(tf_core_framework
    tf_core_lib
    proto_text
 )
+
+if(WIN32)
+  # CMake > 3.6 will quote this as -D"__VERSION__=\"MSVC\"", which nvcc fails on.
+  # Instead of defining this globally, limit it to tf_core_framework, where it is used.
+  target_compile_definitions(tf_core_framework PRIVATE __VERSION__="MSVC")
+endif()
--- a/tensorflow/contrib/cmake/tf_core_kernels.cmake
+++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake
@ -93,6 +93,12 @@ if(WIN32)
      "${tensorflow_source_dir}/tensorflow/core/kernels/meta_support.*"
      "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.h"
      "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.cc"
+      # not in tensorflow.dll - comes from .so
+      "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/blas_gemm.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/gru_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/lstm_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/gru_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/lstm_ops.cc"
  )
  list(REMOVE_ITEM tf_core_kernels_srcs ${tf_core_kernels_windows_exclude_srcs})
 endif(WIN32)
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@ -623,12 +623,7 @@ add_custom_command(
      COMMENT "Running SWIG to generate Python wrappers"
      VERBATIM )

-# pywrap_tensorflow_internal is a shared library containing all of the
-# TensorFlow runtime and the standard ops and kernels. These are installed into
-# tf_python/tensorflow/python/.
-# TODO(mrry): Refactor this to expose a framework library that
-# facilitates `tf.load_op_library()`.
-add_library(pywrap_tensorflow_internal SHARED
+set (pywrap_tensorflow_internal_src
    "${tensorflow_source_dir}/tensorflow/python/client/tf_session_helper.h"
    "${tensorflow_source_dir}/tensorflow/python/client/tf_session_helper.cc"
    "${tensorflow_source_dir}/tensorflow/python/framework/cpp_shape_inference.h"
@ -652,6 +647,20 @@ add_library(pywrap_tensorflow_internal SHARED
    "${tensorflow_source_dir}/tensorflow/c/tf_status_helper.cc"
    "${tensorflow_source_dir}/tensorflow/c/tf_status_helper.h"
    "${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.cc"
+)
+
+if(WIN32)
+    # Windows: build a static library with the same objects as tensorflow.dll.
+    # This can be used to build for a standalone exe and also helps us to
+    # find all symbols that need to be exported from the dll which is needed
+    # to provide the tensorflow c/c++ api in tensorflow.dll.
+    # From the static library we create the def file with all symbols that need to
+    # be exported from tensorflow.dll. Because there is a limit of 64K symbols
+    # that can be exported, we filter the symbols with a python script to the namespaces
+    # we need.
+    #
+    add_library(pywrap_tensorflow_internal_static STATIC
+        ${pywrap_tensorflow_internal_src}
        $<TARGET_OBJECTS:tf_core_lib>
        $<TARGET_OBJECTS:tf_core_cpu>
        $<TARGET_OBJECTS:tf_core_framework>
@ -663,6 +672,47 @@ add_library(pywrap_tensorflow_internal SHARED
        $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
        $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
    )
+    target_include_directories(pywrap_tensorflow_internal_static PUBLIC
+        ${PYTHON_INCLUDE_DIR}
+        ${NUMPY_INCLUDE_DIR}
+    )
+    target_link_libraries(pywrap_tensorflow_internal_static
+        tf_protos_cc
+        tf_python_protos_cc
+    )
+    set(pywrap_tensorflow_deffile "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow.def")
+    set_source_files_properties(${pywrap_tensorflow_deffile} PROPERTIES GENERATED TRUE)
+
+    add_custom_command(TARGET pywrap_tensorflow_internal_static POST_BUILD
+        COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/tools/create_def_file.py
+            --input $<TARGET_FILE:pywrap_tensorflow_internal_static>
+            --output ${pywrap_tensorflow_deffile}
+    )
+endif(WIN32)
+
+
+# pywrap_tensorflow_internal is a shared library containing all of the
+# TensorFlow runtime and the standard ops and kernels. These are installed into
+# tf_python/tensorflow/python/.
+add_library(pywrap_tensorflow_internal SHARED
+    ${pywrap_tensorflow_internal_src}
+    $<TARGET_OBJECTS:tf_core_lib>
+    $<TARGET_OBJECTS:tf_core_cpu>
+    $<TARGET_OBJECTS:tf_core_framework>
+    $<TARGET_OBJECTS:tf_core_ops>
+    $<TARGET_OBJECTS:tf_core_direct_session>
+    $<TARGET_OBJECTS:tf_tools_transform_graph_lib>
+    $<$<BOOL:${tensorflow_ENABLE_GRPC_SUPPORT}>:$<TARGET_OBJECTS:tf_core_distributed_runtime>>
+    $<TARGET_OBJECTS:tf_core_kernels>
+    $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
+    $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
+    ${pywrap_tensorflow_deffile}
+)
+
+if(WIN32)
+    add_dependencies(pywrap_tensorflow_internal pywrap_tensorflow_internal_static)
+endif(WIN32)
+
 target_include_directories(pywrap_tensorflow_internal PUBLIC
    ${PYTHON_INCLUDE_DIR}
    ${NUMPY_INCLUDE_DIR}
@ -675,6 +725,44 @@ target_link_libraries(pywrap_tensorflow_internal
    ${PYTHON_LIBRARIES}
 )

+if(WIN32)
+    # include contrib/rnn as .so
+    #
+    set(tf_gru_srcs
+        "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/blas_gemm.cc"
+        "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/blas_gemm.h"
+        "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/gru_ops.cc"
+        "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/gru_ops.h"
+        "${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/gru_ops.cc"
+    )
+    set(tf_gru_gpu_srcs
+        "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/gru_ops_gpu.cu.cc"
+    )
+
+    set(tf_lstm_srcs
+        "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/blas_gemm.cc"
+        "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/blas_gemm.h"
+        "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/lstm_ops.cc"
+        "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/lstm_ops.h"
+        "${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/lstm_ops.cc"
+    )
+    set(tf_lstm_gpu_srcs
+        "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc"
+    )
+
+    AddUserOps(TARGET _gru_ops
+        SOURCES "${tf_gru_srcs}"
+        GPUSOURCES ${tf_gru_gpu_srcs}
+        DEPENDS pywrap_tensorflow_internal tf_python_ops
+        DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/rnn/python/ops/)
+
+    AddUserOps(TARGET _lstm_ops
+        SOURCES "${tf_lstm_srcs}"
+        GPUSOURCES ${tf_lstm_gpu_srcs}
+        DEPENDS pywrap_tensorflow_internal tf_python_ops
+        DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/rnn/python/ops/)
+endif(WIN32)
+
 ############################################################
 # Build a PIP package containing the TensorFlow runtime.
 ############################################################
@ -684,14 +772,17 @@ add_dependencies(tf_python_build_pip_package
    tensorboard_copy_dependencies
    tf_python_copy_scripts_to_destination
    tf_python_touchup_modules
-    tf_python_ops)
+    tf_python_ops
+    tf_extension_ops)
 add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
  COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tools/pip_package/setup.py
                                   ${CMAKE_CURRENT_BINARY_DIR}/tf_python/)
 if(WIN32)
  add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.dll
-                                     ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd)
+                                     ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.lib
+                                     ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
 else()
  add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so
--- a/tensorflow/contrib/cmake/tf_tests.cmake
+++ b/tensorflow/contrib/cmake/tf_tests.cmake
@ -115,7 +115,14 @@ if (tensorflow_BUILD_PYTHON_TESTS)
  #

  # include all test
+  if (WIN32)
+    file(GLOB_RECURSE tf_test_rnn_src_py
+      "${tensorflow_source_dir}/tensorflow/contrib/rnn/python/kernel_tests/*_test.py"
+    )
+  endif()
+ 
  file(GLOB_RECURSE tf_test_src_py
+    ${tf_test_rnn_src_py}
    "${tensorflow_source_dir}/tensorflow/python/debug/cli/*_test.py"
    "${tensorflow_source_dir}/tensorflow/python/debug/lib/*_test.py"
    "${tensorflow_source_dir}/tensorflow/python/debug/wrappers/*_test.py"
--- a/tensorflow/contrib/cmake/tf_tools.cmake
+++ b/tensorflow/contrib/cmake/tf_tools.cmake
@ -106,3 +106,22 @@ target_link_libraries(${compare_graphs} PUBLIC
  ${tf_core_gpu_kernels_lib}
  ${tensorflow_EXTERNAL_LIBRARIES}
 )
+
+set(benchmark_model "benchmark_model")
+
+add_executable(${benchmark_model}
+    "${tensorflow_source_dir}/tensorflow/tools/benchmark/benchmark_model.cc"
+    "${tensorflow_source_dir}/tensorflow/tools/benchmark/benchmark_model_main.cc"
+    $<TARGET_OBJECTS:tf_core_lib>
+    $<TARGET_OBJECTS:tf_core_cpu>
+    $<TARGET_OBJECTS:tf_core_framework>
+    $<TARGET_OBJECTS:tf_core_ops>
+    $<TARGET_OBJECTS:tf_core_direct_session>
+    $<TARGET_OBJECTS:tf_core_kernels>
+)
+
+target_link_libraries(${benchmark_model} PUBLIC
+  tf_protos_cc
+  ${tf_core_gpu_kernels_lib}
+  ${tensorflow_EXTERNAL_LIBRARIES}
+)
--- a/tensorflow/contrib/cmake/tools/create_def_file.py
+++ b/tensorflow/contrib/cmake/tools/create_def_file.py
@ -0,0 +1,134 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""
+create_def_file.py - tool to create a windows def file to export
+symbols from tensorflow.dll to enable tf.load_op_library().
+Because the linker allows only 64K symbols to be exported per dll
+we filter the symbols down to the essentials. The regular expressions
+we use for this are specific to tensorflow.
+
+TODO: this works fine, but there is an issue with exporting
+'const char * const' and importing it from a user op. The problem is
+on the importing end, and using __declspec(dllimport) works around it.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import io
+import os
+import re
+import sys
+import tempfile
+from subprocess import Popen, PIPE
+
+# External tools we use that come with visual studio sdk and
+# we assume that the caller has the correct PATH to the sdk
+UNDNAME = "undname.exe"
+DUMPBIN = "dumpbin.exe"
+
+# Exclude if matched
+EXCLUDE_RE = re.compile(r"deleting destructor|::internal::")
+
+# Include if matched before exclude
+INCLUDEPRE_RE = re.compile(r"tensorflow::internal::LogMessage|" +
+                           r"tensorflow::internal::CheckOpMessageBuilder")
+
+# Include if matched after exclude
+INCLUDE_RE = re.compile(r"^(TF_\w*)$|" +
+                        r"tensorflow::|" +
+                        r"functor::|" +
+                        r"perftools::gputools")
+
+
+def get_args():
+  """Parse command line."""
+  parser = argparse.ArgumentParser()
+  parser.add_argument("--input", help="input library", required=True)
+  parser.add_argument("--output", help="output deffile", required=True)
+  args = parser.parse_args()
+  return args
+
+
+def main():
+  """main."""
+  args = get_args()
+
+  # Pipe dumpbin to extract all linkable symbols from a lib.
+  # Good symbols are collected in candidates and also written to
+  # a temp file.
+  candidates = []
+  tmpfile = tempfile.NamedTemporaryFile(mode="w", delete=False)
+  proc = Popen([DUMPBIN, "/nologo", "/linkermember:1", args.input], stdout=PIPE)
+  for line in io.TextIOWrapper(proc.stdout, encoding="utf-8"):
+    cols = line.split()
+    if len(cols) < 2:
+      continue
+    sym = cols[1]
+    tmpfile.file.write(sym + "\n")
+    candidates.append(sym)
+  tmpfile.file.close()
+  exit_code = proc.wait()
+  if exit_code != 0:
+    print("{} failed, exit={}".format(DUMPBIN, exit_code))
+    return exit_code
+
+  # Run the symbols through undname to get their undecorated name
+  # so we can filter on something readable.
+  with open(args.output, "w") as def_fp:
+    # track dupes
+    taken = set()
+
+    # Header for the def file. Since tensorflow.dll is actually called
+    # _pywrap_tensorflow_internal.pyd in the python wheel, hint that here.
+    def_fp.write("LIBRARY _pywrap_tensorflow_internal.pyd\n")
+    def_fp.write("EXPORTS\n")
+    def_fp.write("\t ??1OpDef@tensorflow@@UEAA@XZ\n")
+
+    # Each symbol returned by undname matches the same position in candidates.
+    # We compare on undname but use the decorated name from candidates.
+    dupes = 0
+    proc = Popen([UNDNAME, tmpfile.name], stdout=PIPE)
+    for idx, line in enumerate(io.TextIOWrapper(proc.stdout, encoding="utf-8")):
+      decorated = candidates[idx]
+      if decorated in taken:
+        # Symbol is already in output, done.
+        dupes += 1
+        continue
+
+      if not INCLUDEPRE_RE.search(line):
+        if EXCLUDE_RE.search(line):
+          continue
+        if not INCLUDE_RE.search(line):
+          continue
+
+      def_fp.write("\t" + decorated + "\n")
+      taken.add(decorated)
+  exit_code = proc.wait()
+  if exit_code != 0:
+    print("{} failed, exit={}".format(UNDNAME, exit_code))
+    return exit_code
+
+  os.unlink(tmpfile.name)
+
+  print("symbols={}, taken={}, dupes={}"
+        .format(len(candidates), len(taken), dupes))
+  return 0
+
+
+if __name__ == "__main__":
+  sys.exit(main())
--- a/tensorflow/contrib/framework/python/framework/checkpoint_utils.py
+++ b/tensorflow/contrib/framework/python/framework/checkpoint_utils.py
@ -280,10 +280,11 @@ def init_from_checkpoint(checkpoint_dir, assignment_map):
      for var_name in scope_variables:
        # Lookup name with specified prefix and suffix from current variable.
        # If tensor_name given is '/' (root), don't use it for full name.
+        full_tensor_name = var_name[len(scopes):]
+        if current_var_or_name != "/":
+          full_tensor_name = full_tensor_name[1:]
        if tensor_name_in_ckpt != "/":
-          full_tensor_name = tensor_name_in_ckpt + var_name[len(scopes) + 1:]
-        else:
-          full_tensor_name = var_name[len(scopes) + 1:]
+          full_tensor_name = tensor_name_in_ckpt + full_tensor_name
        if full_tensor_name not in variable_map:
          raise ValueError(
              "Tensor %s (%s in %s) is not found in %s checkpoint" % (
--- a/tensorflow/contrib/framework/python/framework/checkpoint_utils_test.py
+++ b/tensorflow/contrib/framework/python/framework/checkpoint_utils_test.py
@ -168,6 +168,29 @@ class CheckpointsTest(test.TestCase):
        self.assertAllEqual(my3.eval(session), v3)
        self.assertAllEqual(my4.eval(session), v4)

+  def testInitToRootCheckpoint(self):
+    """Checks init_from_checkpoint with the root-to-root mapping {"/": "/"}.
+
+    Variables are recreated in a fresh graph under the same names used by
+    _create_checkpoints (including one inside "useful_scope"), and their
+    values after initialization are compared with the checkpointed values.
+    """
+    checkpoint_dir = self.get_temp_dir()
+    with self.test_session() as session:
+      v1, v2, v3, v4 = _create_checkpoints(session, checkpoint_dir)
+
+    # New graph and session.
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as session:
+        my1 = variable_scope.get_variable("var1", [1, 10])
+        my2 = variable_scope.get_variable("var2", [10, 10])
+        my3 = variable_scope.get_variable("var3", [100, 100])
+        with variable_scope.variable_scope("useful_scope"):
+          my4 = variable_scope.get_variable("var4", [9, 9])
+
+        # Map the checkpoint root onto the root of the current variable scope.
+        checkpoint_utils.init_from_checkpoint(checkpoint_dir,
+                                              {"/": "/",})
+
+        session.run(variables.global_variables_initializer())
+        self.assertAllEqual(my1.eval(session), v1)
+        self.assertAllEqual(my2.eval(session), v2)
+        self.assertAllEqual(my3.eval(session), v3)
+        self.assertAllEqual(my4.eval(session), v4)
+
+
  def testInitFromPartitionVar(self):
    checkpoint_dir = self.get_temp_dir()
    with self.test_session() as session:
--- a/tensorflow/contrib/framework/python/ops/arg_scope.py
+++ b/tensorflow/contrib/framework/python/ops/arg_scope.py
@ -30,11 +30,15 @@
    net = layers.conv2d(inputs, 64, [11, 11], 4, padding='VALID', scope='conv1')
    net = layers.conv2d(net, 256, [5, 5], scope='conv2')
  ```
-  The first call to conv2d will use predefined args:
-    layers.conv2d(inputs, 64, [11, 11], 4, padding='VALID', ..., scope='conv1')
+  The first call to conv2d will behave as follows:
+    layers.conv2d(inputs, 64, [11, 11], 4, padding='VALID',
+                  initializer=layers.variance_scaling_initializer(),
+                  regularizer=layers.l2_regularizer(0.05), scope='conv1')

-  The second call to conv2d will overwrite padding:
-    layers.conv2d(inputs, 256, [5, 5], padding='SAME', ..., scope='conv2')
+  The second call to conv2d will also use the arg_scope's default for padding:
+    layers.conv2d(inputs, 256, [5, 5], padding='SAME',
+                  initializer=layers.variance_scaling_initializer(),
+                  regularizer=layers.l2_regularizer(0.05), scope='conv2')

  Example of how to reuse an arg_scope:

@ -49,7 +53,7 @@
    net = layers.conv2d(net, 256, [5, 5], scope='conv2')
  ```

-  Example of how to use tf.contrib.framework.add_arg_scope:
+  Example of how to use tf.contrib.framework.add_arg_scope to enable your function to be called within an arg_scope later:

  @tf.contrib.framework.add_arg_scope
  def conv2d(*args, **kwargs)
--- a/tensorflow/contrib/layers/init.py
+++ b/tensorflow/contrib/layers/init.py
@ -40,6 +40,7 @@ See the @{$python/contrib.layers} guide.
@@softmax
@@stack
@@unit_norm
+@@bow_encoder
@@embed_sequence

@@apply_regularization
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@ -160,9 +160,8 @@ def _fused_batch_norm(
  they need to be added as a dependency to the `train_op`, example:

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
-    if update_ops:
-      updates = tf.group(*update_ops)
-      total_loss = control_flow_ops.with_dependencies([updates], total_loss)
+    with tf.control_dependencies(update_ops):
+      train_op = optimizer.minimize(loss)

  One can set updates_collections=None to force the updates in place, but that
  can have speed penalty, especially in distributed settings.
@ -393,9 +392,8 @@ def batch_norm(inputs,
  they need to be added as a dependency to the `train_op`, example:

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
-    if update_ops:
-      updates = tf.group(*update_ops)
-      total_loss = control_flow_ops.with_dependencies([updates], total_loss)
+    with tf.control_dependencies(update_ops):
+      train_op = optimizer.minimize(loss)

  One can set updates_collections=None to force the updates in place, but that
  can have speed penalty, especially in distributed settings.
--- a/tensorflow/contrib/learn/init.py
+++ b/tensorflow/contrib/learn/init.py
@ -33,6 +33,7 @@ See the @{$python/contrib.learn} guide.
@@DNNLinearCombinedRegressor
@@DNNLinearCombinedEstimator
@@DNNLinearCombinedClassifier
+@@DynamicRnnEstimator
@@LinearClassifier
@@LinearEstimator
@@LinearRegressor
--- a/tensorflow/contrib/learn/python/learn/README.md
+++ b/tensorflow/contrib/learn/python/learn/README.md
@ -20,18 +20,17 @@ Optionally you can install [scikit-learn](http://scikit-learn.org/stable/) and [

 ### Tutorials

-   [TF Learn Quickstart](../../../../g3doc/tutorials/tflearn/index.md). Build,
+-   [TF Learn Quickstart](https://www.tensorflow.org/get_started/tflearn). Build,
    train, and evaluate a neural network with just a few lines of code.
-   [Input Functions](../../../../g3doc/tutorials/input_fn/index.md). Learn how
+-   [Input Functions](https://www.tensorflow.org/get_started/input_fn). Learn how
    to create input functions to feed data into your models.
-   [Linear Model](../../../../g3doc/tutorials/wide/index.md). Learn the basics
+-   [Linear Model](https://www.tensorflow.org/tutorials/wide). Learn the basics
    of building linear models.
-   [Wide and Deep
-    Learning](../../../../g3doc/tutorials/wide_and_deep/index.md). Jointly train
-    a linear model and a deep neural network.
-   [Logging and Monitoring](../../../../g3doc/tutorials/monitors/index.md). Use
-    the Monitor API to audit training of a neural network.
-   [Custom Estimators](../../../../g3doc/tutorials/estimators/index.md). Learn
+-   [Wide and Deep Learning](https://www.tensorflow.org/tutorials/wide_and_deep).
+    Jointly train a linear model and a deep neural network.
+-   [Logging and Monitoring](https://www.tensorflow.org/get_started/monitors).
+    Use the Monitor API to audit training of a neural network.
+-   [Custom Estimators](https://www.tensorflow.org/extend/estimators). Learn
    how to create a custom estimator.
 -   More coming soon.

--- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
@ -1108,7 +1108,7 @@ class Estimator(BaseEstimator):
    if isinstance(model_fn_results, model_fn_lib.ModelFnOps):
      return model_fn_results

-    # Here model_fn_ops should be a tuple with 3 elements.
+    # Here model_fn_results should be a tuple with 3 elements.
    if len(model_fn_results) != 3:
      raise ValueError('Unrecognized value returned by model_fn, '
                       'please return ModelFnOps.')
--- a/tensorflow/contrib/learn/python/learn/estimators/linear.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/linear.py
@ -149,21 +149,16 @@ def _linear_model_fn(features, labels, mode, params, config=None):
      values=tuple(six.itervalues(features)),
      partitioner=partitioner) as scope:
    if joint_weights:
-      logits, _, _ = (
-          layers.joint_weighted_sum_from_feature_columns(
-              columns_to_tensors=features,
-              feature_columns=feature_columns,
-              num_outputs=head.logits_dimension,
-              weight_collections=[parent_scope],
-              scope=scope))
+      layer_fn = layers.joint_weighted_sum_from_feature_columns
    else:
-      logits, _, _ = (
-          layers.weighted_sum_from_feature_columns(
+      layer_fn = layers.weighted_sum_from_feature_columns
+        
+    logits, _, _ = layer_fn(
            columns_to_tensors=features,
            feature_columns=feature_columns,
            num_outputs=head.logits_dimension,
            weight_collections=[parent_scope],
-              scope=scope))
+            scope=scope)

    def _train_op_fn(loss):
      global_step = contrib_variables.get_global_step()
--- a/tensorflow/contrib/learn/python/learn/tests/dataframe/tensorflow_dataframe_test.py
+++ b/tensorflow/contrib/learn/python/learn/tests/dataframe/tensorflow_dataframe_test.py
@ -63,9 +63,12 @@ def _assert_df_equals_dict(expected_df, actual_dict):
                                                             actual_dict[col]))


-def _make_test_csv():
+class TensorFlowDataFrameTestCase(test.TestCase):
+  """Tests for `TensorFlowDataFrame`."""
+
+  def _make_test_csv(self):
    f = tempfile.NamedTemporaryFile(
-      dir=test.get_temp_dir(), delete=False, mode="w")
+        dir=self.get_temp_dir(), delete=False, mode="w")
    w = csv.writer(f)
    w.writerow(["int", "float", "bool", "string"])
    for _ in range(100):
@ -79,10 +82,9 @@ def _make_test_csv():
    f.close()
    return f.name

-
-def _make_test_csv_sparse():
+  def _make_test_csv_sparse(self):
    f = tempfile.NamedTemporaryFile(
-      dir=test.get_temp_dir(), delete=False, mode="w")
+        dir=self.get_temp_dir(), delete=False, mode="w")
    w = csv.writer(f)
    w.writerow(["int", "float", "bool", "string"])
    for _ in range(100):
@ -98,9 +100,8 @@ def _make_test_csv_sparse():
    f.close()
    return f.name

-
-def _make_test_tfrecord():
-  f = tempfile.NamedTemporaryFile(dir=test.get_temp_dir(), delete=False)
+  def _make_test_tfrecord(self):
+    f = tempfile.NamedTemporaryFile(dir=self.get_temp_dir(), delete=False)
    w = tf_record.TFRecordWriter(f.name)
    for i in range(100):
      ex = example_pb2.Example()
@ -110,10 +111,6 @@ def _make_test_tfrecord():
      w.write(ex.SerializeToString())
    return f.name

-
-class TensorFlowDataFrameTestCase(test.TestCase):
-  """Tests for `TensorFlowDataFrame`."""
-
  def _assert_pandas_equals_tensorflow(self, pandas_df, tensorflow_df,
                                       num_batches, batch_size):
    self.assertItemsEqual(
@ -190,7 +187,7 @@ class TensorFlowDataFrameTestCase(test.TestCase):
    batch_size = 8
    enqueue_size = 7

-    data_path = _make_test_csv()
+    data_path = self._make_test_csv()
    default_values = [0, 0.0, 0, ""]

    pandas_df = pd.read_csv(data_path)
@ -211,7 +208,7 @@ class TensorFlowDataFrameTestCase(test.TestCase):
    num_epochs = 17
    expected_num_batches = (num_epochs * 100) // batch_size

-    data_path = _make_test_csv()
+    data_path = self._make_test_csv()
    default_values = [0, 0.0, 0, ""]

    tensorflow_df = df.TensorFlowDataFrame.from_csv(
@ -234,7 +231,7 @@ class TensorFlowDataFrameTestCase(test.TestCase):
    num_batches = 100
    batch_size = 8

-    data_path = _make_test_csv_sparse()
+    data_path = self._make_test_csv_sparse()
    feature_spec = {
        "int": parsing_ops.FixedLenFeature(None, dtypes.int16, np.nan),
        "float": parsing_ops.VarLenFeature(dtypes.float16),
@ -270,7 +267,7 @@ class TensorFlowDataFrameTestCase(test.TestCase):
    enqueue_size = 11
    batch_size = 13

-    data_path = _make_test_tfrecord()
+    data_path = self._make_test_tfrecord()
    features = {
        "fixed_len_float":
            parsing_ops.FixedLenFeature(
@ -318,7 +315,7 @@ class TensorFlowDataFrameTestCase(test.TestCase):
    num_epochs = 17
    expected_num_batches = (num_epochs * 100) // batch_size

-    data_path = _make_test_csv()
+    data_path = self._make_test_csv()
    default_values = [0, 0.0, 0, ""]

    tensorflow_df = df.TensorFlowDataFrame.from_csv(
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@ -261,7 +261,7 @@ def streaming_false_negatives(predictions, labels, weights=None,
                              metrics_collections=None,
                              updates_collections=None,
                              name=None):
-  """Computes the total number of false positives.
+  """Computes the total number of false negatives.

  If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.

--- a/tensorflow/contrib/rnn/python/ops/core_rnn_cell_impl.py
+++ b/tensorflow/contrib/rnn/python/ops/core_rnn_cell_impl.py
@ -13,7 +13,14 @@
 # limitations under the License.
 # ==============================================================================

-"""Module implementing RNN Cells."""
+"""Module implementing RNN Cells.
+
+This module provides a number of basic commonly used RNN cells, such as LSTM
+(Long Short Term Memory) or GRU (Gated Recurrent Unit), and a number of
+operators that allow adding dropouts, projections, or embeddings for inputs.
+Constructing multi-layer cells is supported by the class `MultiRNNCell`, or by
+calling the `rnn` ops several times.
+"""

 from __future__ import absolute_import
 from __future__ import division
@ -146,12 +153,12 @@ class GRUCell(RNNCell):
    with _checked_scope(self, scope or "gru_cell", reuse=self._reuse):
      with vs.variable_scope("gates"):  # Reset gate and update gate.
        # We start with bias of 1.0 to not reset and not update.
+        value = sigmoid(_linear(
+          [inputs, state], 2 * self._num_units, True, 1.0))
        r, u = array_ops.split(
-            value=_linear(
-                [inputs, state], 2 * self._num_units, True, 1.0),
+            value=value,
            num_or_size_splits=2,
            axis=1)
-        r, u = sigmoid(r), sigmoid(u)
      with vs.variable_scope("candidate"):
        c = self._activation(_linear([inputs, r * state],
                                     self._num_units, True))
--- a/tensorflow/contrib/rnn/python/ops/lstm_ops.py
+++ b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
@ -70,7 +70,7 @@ def _lstm_block_cell(x,
  cs = ci .* i + cs_prev .* f
  cs = clip(cs, cell_clip)

-  o = sigmoid(cs * wco + f)
+  o = sigmoid(cs * wco + o)
  co = tanh(cs)
  h = co .* o
  ```
--- a/tensorflow/contrib/seq2seq/python/ops/helper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/helper.py
@ -486,7 +486,7 @@ class GreedyEmbeddingHelper(Helper):
    # Outputs are logits, use argmax to get the most probable id
    if not isinstance(outputs, ops.Tensor):
      raise TypeError("Expected outputs to be a single Tensor, got: %s" %
-                      outputs)
+                      type(outputs))
    sample_ids = math_ops.cast(
        math_ops.argmax(outputs, axis=-1), dtypes.int32)
    return sample_ids
--- a/tensorflow/contrib/seq2seq/python/ops/loss.py
+++ b/tensorflow/contrib/seq2seq/python/ops/loss.py
@ -44,8 +44,7 @@ def sequence_loss(logits, targets, weights,
      sequence. When using weights as masking set all valid timesteps to 1 and
      all padded timesteps to 0.
    average_across_timesteps: If set, sum the cost across the sequence
-      dimension and divide by the cost by the total label weight across
-      timesteps.
+      dimension and divide the cost by the total label weight across timesteps.
    average_across_batch: If set, sum the cost across the batch dimension and
      divide the returned cost by the batch size.
    softmax_loss_function: Function (labels-batch, inputs-batch) -> loss-batch
--- a/tensorflow/contrib/slim/README.md
+++ b/tensorflow/contrib/slim/README.md
@ -229,7 +229,7 @@ net = ...
 net = slim.conv2d(net, 256, [3, 3], scope='conv3_1')
 net = slim.conv2d(net, 256, [3, 3], scope='conv3_2')
 net = slim.conv2d(net, 256, [3, 3], scope='conv3_3')
-net = slim.max_pool2d(net, [2, 2], scope='pool3')
+net = slim.max_pool2d(net, [2, 2], scope='pool2')
 ```

 One way to reduce this code duplication would be via a `for` loop:
@ -238,14 +238,14 @@ One way to reduce this code duplication would be via a `for` loop:
 net = ...
 for i in range(3):
  net = slim.conv2d(net, 256, [3, 3], scope='conv3_' % (i+1))
-net = slim.max_pool2d(net, [2, 2], scope='pool3')
+net = slim.max_pool2d(net, [2, 2], scope='pool2')
 ```

 This can be made even cleaner by using TF-Slim's `repeat` operation:

 ```python
 net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
-net = slim.max_pool(net, [2, 2], scope='pool2')
+net = slim.max_pool2d(net, [2, 2], scope='pool2')
 ```

 Notice that the `slim.repeat` not only applies the same argument in-line, it
--- a/tensorflow/contrib/util/loader.py
+++ b/tensorflow/contrib/util/loader.py
@ -21,6 +21,7 @@ from __future__ import division
 from __future__ import print_function

 import os
+import re

 from tensorflow.python.framework import load_library
 from tensorflow.python.platform import resource_loader
@ -29,9 +30,9 @@ from tensorflow.python.platform import resource_loader
 def load_op_library(path):
  """Loads a contrib op library from the given path.

-  NOTE(mrry): On Windows, we currently assume that contrib op
+  NOTE(mrry): On Windows, we currently assume that some contrib op
  libraries are statically linked into the main TensorFlow Python
-  extension DLL.
+  extension DLL - use dynamically linked ops if the .dll is present.

  Args:
    path: An absolute path to a shared object file.
@ -40,11 +41,17 @@ def load_op_library(path):
    A Python module containing the Python wrappers for Ops defined in the
    plugin.
  """
-  if os.name != 'nt':
+  if os.name == 'nt':
+    # To avoid making every user op aware of Windows, rewrite
+    # the file extension from .so to .dll.
+    path = re.sub('\.so$', '.dll', path)
+
+    # TODO: currently we have only some user_ops as .dll's on windows - don't try
+    #   to load them if the dll is not found. Once we have all of them
+    #   this check should be removed.
+    if not os.path.exists(path):
+      return None
  path = resource_loader.get_path_to_datafile(path)
  ret = load_library.load_op_library(path)
  assert ret, 'Could not load %s' % path
  return ret
-  else:
-    # NOTE(mrry):
-    return None
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@ -339,6 +339,7 @@ tf_cuda_library(
    hdrs = [
        "example/feature_util.h",
        "framework/allocator.h",
+        "framework/allocator_registry.h",
        "framework/attr_value_util.h",
        "framework/bfloat16.h",
        "framework/cancellation.h",
@ -408,7 +409,9 @@ tf_cuda_library(
            "util/memmapped_file_system.h",
            "util/memmapped_file_system_writer.h",
        ],
-    }),
+    }) + if_mkl([
+        "util/mkl_util.h",
+    ]),
    visibility = ["//visibility:public"],
    deps = [":framework_internal"],
 )
@ -707,7 +710,9 @@ cc_library(
        "//tensorflow/core/kernels:math_not_windows",
        "//tensorflow/core/kernels:quantized_ops",
    ]) + if_mkl([
-        "//tensorflow/core/kernels:mkl_ops",
+        "//tensorflow/core/kernels:mkl_conv_op",
+        "//tensorflow/core/kernels:mkl_matmul_op",
+        "//tensorflow/core/kernels:mkl_tfconv_op",
    ]),
 )

@ -772,7 +777,7 @@ cc_library(
        "//tensorflow/core/kernels:constant_op",
        "//tensorflow/core/kernels:ops_testutil",
        "//tensorflow/core/kernels:ops_util",
-        "//tensorflow/core/platform/default/build_config:gtest",  # + if_sycl([":sycl_runtime"]),
+        "//tensorflow/core/platform/default/build_config:gtest",  # + if_sycl([":sycl_runtime"])
    ],
 )

@ -1393,7 +1398,7 @@ tf_cuda_library(
        ":version_lib",
        "//tensorflow/core/kernels:bounds_check",
        "//third_party/eigen3",
-    ],
+    ] + if_mkl(["//third_party/mkl:intel_binary_blob"]),
    alwayslink = 1,
 )

@ -1495,7 +1500,8 @@ tf_cuda_library(
               "//tensorflow/core/grappler/optimizers:meta_optimizer",
               "//third_party/eigen3",
               "//tensorflow/core/kernels:required",
-    ] + tf_additional_core_deps(),
+           ] + if_mkl(["//third_party/mkl:intel_binary_blob"]) +
+           tf_additional_core_deps(),
    alwayslink = 1,
 )

@ -2037,11 +2043,14 @@ tf_cc_tests(
    ],
 )

-if_mkl(
 tf_cc_test_mkl(
    name = "mkl_related_tests",
    size = "small",
-        srcs = ["graph/mkl_optimizer_merge_test.cc"],
+    srcs = [
+        "graph/mkl_layout_pass_test.cc",
+        "graph/mkl_optimizer_merge_test.cc",
+        "graph/mkl_tfconversion_pass_test.cc",
+    ],
    linkstatic = tf_kernel_tests_linkstatic(),
    deps = [
        ":core",
@ -2053,17 +2062,19 @@ if_mkl(
        ":lib",
        ":lib_internal",
        ":ops",
-            ":protos_all_cc",  # under if_mkl
+        ":protos_all_cc",
        ":test",
        ":test_main",
        ":testlib",
        "//tensorflow/cc:cc_ops",
        "//tensorflow/cc:scope",
        "//tensorflow/cc:sendrecv_ops",
+        "//tensorflow/core/kernels:mkl_conv_op",
+        "//tensorflow/core/kernels:mkl_matmul_op",
+        "//tensorflow/core/kernels:mkl_tfconv_op",
        "//tensorflow/core/kernels:ops_util",
        "//third_party/eigen3",
    ],
-    ),
 )

 tf_cc_tests_gpu(
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@ -0,0 +1,120 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// A simple CPU allocator that intercepts malloc/free calls from the MKL
+// library and redirects them to the TensorFlow allocator.
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_
+
+#ifdef INTEL_MKL
+
+#include <string>
+#include "tensorflow/core/common_runtime/bfc_allocator.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/platform/mem.h"
+
+#include "third_party/mkl/include/i_malloc.h"
+
+namespace tensorflow {
+
+// SubAllocator that obtains and releases memory through port::AlignedMalloc
+// and port::AlignedFree. It is handed to the BFCAllocator created by
+// MklCPUAllocator.
+class MklSubAllocator : public SubAllocator {
+ public:
+  ~MklSubAllocator() override {}
+
+  // Returns num_bytes of memory aligned to `alignment`.
+  void* Alloc(size_t alignment, size_t num_bytes) override {
+    return port::AlignedMalloc(num_bytes, alignment);
+  }
+  // Releases memory obtained from Alloc(); num_bytes is not used.
+  void Free(void* ptr, size_t num_bytes) override { port::AlignedFree(ptr); }
+};
+
+/// CPU allocator for MKL that wraps BFC allocator and intercepts
+/// and redirects memory allocation calls from MKL.
+class MklCPUAllocator : public Allocator {
+ public:
+  // Constructor and other standard functions
+
+  /// Creates the backing BFCAllocator (fed by MklSubAllocator) and points the
+  /// MKL i_malloc/i_calloc/i_realloc/i_free hooks at the static hook
+  /// functions declared below.
+  MklCPUAllocator() {
+    VLOG(2) << "MklCPUAllocator: In MklCPUAllocator";
+    allocator_ =
+        new BFCAllocator(new MklSubAllocator, kMaxMemSize, kAllowGrowth, kName);
+
+    // For redirecting all allocations from MKL to this allocator
+    // From: http://software.intel.com/en-us/node/528565
+    i_malloc = MallocHook;
+    i_calloc = CallocHook;
+    i_realloc = ReallocHook;
+    i_free = FreeHook;
+  }
+
+  ~MklCPUAllocator() override { delete allocator_; }
+
+  inline string Name() override { return kName; }
+
+  /// Forwards the allocation request to the wrapped allocator.
+  inline void* AllocateRaw(size_t alignment, size_t num_bytes) override {
+    return allocator_->AllocateRaw(alignment, num_bytes);
+  }
+
+  /// Forwards the deallocation request to the wrapped allocator.
+  inline void DeallocateRaw(void* ptr) override {
+    allocator_->DeallocateRaw(ptr);
+  }
+
+ private:
+  // Hooks provided by this allocator for memory allocation routines from MKL
+
+  /// malloc replacement for MKL: allocates `size` bytes, aligned to
+  /// kAlignment, from cpu_allocator().
+  static inline void* MallocHook(size_t size) {
+    VLOG(2) << "MklCPUAllocator: In MallocHook";
+    return cpu_allocator()->AllocateRaw(kAlignment, size);
+  }
+
+  /// free replacement for MKL: returns `ptr` to cpu_allocator().
+  static inline void FreeHook(void* ptr) {
+    VLOG(2) << "MklCPUAllocator: In FreeHook";
+    cpu_allocator()->DeallocateRaw(ptr);
+  }
+
+  /// calloc replacement for MKL. Not implemented: asserts with an
+  /// UNIMPLEMENTED status instead of allocating.
+  static inline void* CallocHook(size_t num, size_t size) {
+    Status s = Status(error::Code::UNIMPLEMENTED,
+                      "Unimplemented case for hooking MKL function.");
+    TF_CHECK_OK(s);  // way to assert with an error message
+    // The check above is expected to fail for a non-OK status; the explicit
+    // return gives this non-void function a value on every path.
+    return nullptr;
+  }
+
+  /// realloc replacement for MKL. Not implemented: asserts with an
+  /// UNIMPLEMENTED status instead of reallocating.
+  static inline void* ReallocHook(void* ptr, size_t size) {
+    Status s = Status(error::Code::UNIMPLEMENTED,
+                      "Unimplemented case for hooking MKL function.");
+    TF_CHECK_OK(s);  // way to assert with an error message
+    // The check above is expected to fail for a non-OK status; the explicit
+    // return gives this non-void function a value on every path.
+    return nullptr;
+  }
+
+  // TODO(jbobba): We should ideally move this into CPUOptions in config.proto.
+  /// Memory limit - 64GB
+  static const size_t kMaxMemSize =
+      static_cast<size_t>(64) * 1024 * 1024 * 1024;
+
+  /// Do we allow growth in BFC Allocator
+  static const bool kAllowGrowth = true;
+
+  /// Name
+  static constexpr const char* kName = "mklcpu";
+
+  /// The alignment that we need for the allocations
+  static const size_t kAlignment = 64;
+
+  Allocator* allocator_;  // owned by this class
+};
+
+}  // namespace tensorflow
+
+#endif  // INTEL_MKL
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_
--- a/tensorflow/core/common_runtime/threadpool_device.cc
+++ b/tensorflow/core/common_runtime/threadpool_device.cc
@ -17,6 +17,7 @@ limitations under the License.

 #include "tensorflow/core/common_runtime/local_device.h"
 #include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/allocator_registry.h"
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.pb_text.h"
@ -27,6 +28,10 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session_options.h"

+#ifdef INTEL_MKL
+#include "tensorflow/core/common_runtime/mkl_cpu_allocator.h"
+#endif
+
 namespace tensorflow {

 ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options,
@ -70,4 +75,8 @@ Status ThreadPoolDevice::MakeTensorFromProto(
                                 ProtoDebugString(tensor_proto));
 }

+#ifdef INTEL_MKL
+REGISTER_MEM_ALLOCATOR("MklCPUAllocator", 200, MklCPUAllocator);
+#endif
+
 }  // namespace tensorflow
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@ -15,6 +15,7 @@ limitations under the License.

 #include "tensorflow/core/framework/allocator.h"

+#include "tensorflow/core/framework/allocator_registry.h"
 #include "tensorflow/core/framework/log_memory.h"
 #include "tensorflow/core/framework/tracking_allocator.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
@ -119,11 +120,13 @@ Allocator* MakeCpuAllocator() {
 }  // namespace

 Allocator* cpu_allocator() {
-  static Allocator* cpu_alloc = MakeCpuAllocator();
+  static Allocator* cpu_alloc = AllocatorRegistry::Global()->GetAllocator();
  if (cpu_allocator_collect_full_stats && !cpu_alloc->TracksAllocationSizes()) {
    cpu_alloc = new TrackingAllocator(cpu_alloc, true);
  }
  return cpu_alloc;
 }

+REGISTER_MEM_ALLOCATOR("DefaultCPUAllocator", 100, CPUAllocator);
+
 }  // namespace tensorflow
--- a/tensorflow/core/framework/allocator_registry.cc
+++ b/tensorflow/core/framework/allocator_registry.cc
@ -0,0 +1,66 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <string>
+
+#include "tensorflow/core/framework/allocator_registry.h"
+#include "tensorflow/core/platform/logging.h"
+
+
+namespace tensorflow {
+
+// static
+// Returns the single registry instance. It is created on first use via a
+// function-local static and is never deleted.
+AllocatorRegistry* AllocatorRegistry::Global() {
+  static AllocatorRegistry* global_allocator_registry = new AllocatorRegistry;
+  return global_allocator_registry;
+}
+
+// Returns true if an allocator with the same name AND the same priority has
+// already been registered. Entries sharing only one of the two do not count
+// as duplicates.
+bool AllocatorRegistry::CheckForDuplicates(const string& name, int priority) {
+  // Iterate by const reference: each entry holds a string, so iterating by
+  // value would copy it on every step.
+  for (const auto& entry : allocators_) {
+    if (name == entry.name && priority == entry.priority) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Adds `allocator` to the registry under (`name`, `priority`) and recomputes
+// the current allocator as the registered entry with the highest priority.
+// CHECK-fails on an empty name, a negative priority, or a (name, priority)
+// pair that is already registered. The registry does not take ownership of
+// `allocator`.
+void AllocatorRegistry::Register(const string& name, int priority,
+                                 Allocator* allocator) {
+  CHECK(!name.empty()) << "Need a valid name for Allocator";
+  CHECK_GE(priority, 0) << "Priority needs to be non-negative";
+  CHECK(!CheckForDuplicates(name, priority)) << "Allocator with name: [" << name
+                                             << "] and priority: [" << priority
+                                             << "] already registered";
+
+  AllocatorRegistryEntry tmp_entry;
+  tmp_entry.name = name;
+  tmp_entry.priority = priority;
+  tmp_entry.allocator = allocator;
+
+  allocators_.push_back(tmp_entry);
+  // Priorities are non-negative, so starting from -1 guarantees that the
+  // first entry is always selected. The strict comparison means that among
+  // entries with equal priority the earliest registered one wins.
+  int high_pri = -1;
+  for (const auto& entry : allocators_) {
+    if (high_pri < entry.priority) {
+      m_curr_allocator_ = entry.allocator;
+      high_pri = entry.priority;
+    }
+  }
+}
+
+// Returns the allocator selected by the most recent Register() call.
+// CHECK-fails if the stored pointer is null.
+Allocator* AllocatorRegistry::GetAllocator() {
+  return CHECK_NOTNULL(m_curr_allocator_);
+}
+
+}  // namespace tensorflow
--- a/tensorflow/core/framework/allocator_registry.h
+++ b/tensorflow/core/framework/allocator_registry.h
@ -0,0 +1,77 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Classes to maintain a static registry of memory allocators
+#ifndef TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_REGISTRY_H_
+#define TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_REGISTRY_H_
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/framework/allocator.h"
+
+namespace tensorflow {
+
+// A global AllocatorRegistry is used to hold allocators for CPU backends
+class AllocatorRegistry {
+ public:
+  // Add an allocator to the registry.
+  // `name` must be non-empty and `priority` non-negative; registering the
+  // same (name, priority) pair twice is an error. The registry does not take
+  // ownership of `allocator`.
+  void Register(const string& name, int priority, Allocator* allocator);
+
+  // Return allocator with highest priority
+  // If multiple allocators have the same high priority, return one of them
+  Allocator* GetAllocator();
+
+  // Returns the global registry of allocators.
+  static AllocatorRegistry* Global();
+
+ private:
+  // One registered allocator together with its lookup key.
+  typedef struct {
+    string name;
+    int priority;
+    Allocator* allocator;  // not owned
+  } AllocatorRegistryEntry;
+
+  // Returns true if an entry with the same name and priority already exists.
+  bool CheckForDuplicates(const string& name, int priority);
+
+  std::vector<AllocatorRegistryEntry> allocators_;
+  // Highest-priority allocator registered so far. Initialized to nullptr so
+  // that GetAllocator() fails its null check, rather than reading an
+  // indeterminate pointer, when nothing has been registered yet.
+  Allocator* m_curr_allocator_ = nullptr;  // not owned
+};
+
+namespace allocator_registration {
+
+// Helper whose constructor registers `allocator` with the global
+// AllocatorRegistry. The REGISTER_MEM_ALLOCATOR macro defines a static
+// instance of this class to perform the registration.
+class AllocatorRegistration {
+ public:
+  AllocatorRegistration(const string& name, int priority,
+                        Allocator* allocator) {
+    AllocatorRegistry::Global()->Register(name, priority, allocator);
+  }
+};
+
+}  // namespace allocator_registration
+
+// Registers a new instance of class `allocator` with the global
+// AllocatorRegistry under `name` and `priority`. The instance is created
+// with `new` and is not deleted by this macro.
+#define REGISTER_MEM_ALLOCATOR(name, priority, allocator) \
+  REGISTER_MEM_ALLOCATOR_UNIQ_HELPER(__COUNTER__, name, priority, allocator)
+
+// Extra level of indirection so that __COUNTER__ is expanded before it is
+// pasted into the variable name below.
+#define REGISTER_MEM_ALLOCATOR_UNIQ_HELPER(ctr, name, priority, allocator) \
+  REGISTER_MEM_ALLOCATOR_UNIQ(ctr, name, priority, allocator)
+
+// Defines the static AllocatorRegistration object, named with `ctr`, that
+// performs the registration.
+#define REGISTER_MEM_ALLOCATOR_UNIQ(ctr, name, priority, allocator) \
+  static allocator_registration::AllocatorRegistration              \
+      register_allocator_##ctr(name, priority, new allocator)
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_REGISTRY_H_
--- a/tensorflow/core/framework/type_index.h
+++ b/tensorflow/core/framework/type_index.h
@ -17,7 +17,7 @@ limitations under the License.
 #define TENSORFLOW_FRAMEWORK_TYPE_INDEX_H_

 #include <string>
-#ifdef __GXX_RTTI
+#if defined(__GXX_RTTI) || defined(_CPPRTTI)
 #include <typeindex>
 #include <typeinfo>
 #endif  // __GXX_RTTI
@ -30,7 +30,7 @@ namespace tensorflow {
 // binary sizes. The following #ifdef section provides a non-RTTI
 // replacement for std::type_index (with a minimal set of functions needed by
 // the TensorFlow framework, and more can be added if necessary).
-#ifndef __GXX_RTTI
+#if !defined(__GXX_RTTI) && !defined(_CPPRTTI)

 // A thin TypeIndex class that mimics std::type_index but does not use RTTI. As
 // a result, it does not provide the actual name of the type, and only returns a
--- a/tensorflow/core/framework/types.h
+++ b/tensorflow/core/framework/types.h
@ -68,9 +68,9 @@ class DeviceType {
 std::ostream& operator<<(std::ostream& os, const DeviceType& d);

 // Convenient constants that can be passed to a DeviceType constructor
-extern const char* const DEVICE_CPU;   // "CPU"
-extern const char* const DEVICE_GPU;   // "GPU"
-extern const char* const DEVICE_SYCL;  // "SYCL"
+TF_EXPORT extern const char* const DEVICE_CPU;   // "CPU"
+TF_EXPORT extern const char* const DEVICE_GPU;   // "GPU"
+TF_EXPORT extern const char* const DEVICE_SYCL;  // "SYCL"

 typedef gtl::InlinedVector<MemoryType, 4> MemoryTypeVector;
 typedef gtl::ArraySlice<MemoryType> MemoryTypeSlice;
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@ -0,0 +1,548 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+
+#include <vector>
+#include <utility>
+#include <string>
+#include <memory>
+#include <unordered_set>
+#include <functional>
+
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+
+#include "tensorflow/core/graph/mkl_layout_pass.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+namespace tensorflow {
+
+// This pass implements rewriting of graph for propagating Mkl
+// layout as an additional output tensor (we will loosely call a
+// tensor that carries Mkl layout as Mkl tensor henceforth.)
+// from every Mkl supported NN layer.
+//
+// As an example, consider Relu layer. Current definition of Relu
+// layer looks like:
+//
+//           O = Relu(A)
+//
+// Relu has 1 input (A), and 1 output (O).
+//
+// This rewrite pass will generate a new graph node for Relu
+// (new node is called MklRelu) as:
+//
+//          O, O_m = MklRelu(A, A_m)
+//
+// MklRelu has 2 inputs (A and A_m) and 2 outputs (O and O_m).
+// Here A input is same as A input of Relu; O output is same
+// as O output of Relu. O_m is the additional output tensor
+// that will be set by MklRelu, and it represents Mkl tensor
+// corresponding to O -- in other words, O_m is some kind of
+// metadata for O. A_m is additional input of MklRelu, and it
+// represents metadata for A - as O_m is metadata for O, A_m
+// is metadata for A. MklRelu receives this metadata from
+// previous layer (in the graph).
+//
+// When previous layer in the graph is Mkl layer, A_m will
+// represent a valid Mkl tensor. But when previous layer
+// is not an Mkl layer, then A_m represents a dummy Mkl tensor.
+//
+// Rewriting rules:
+//   - Selection of an op for rewriting happens by registering
+//     an op with this pass. If an op is not registered, then
+//     it is not rewritten.
+//  - Number of inputs after rewriting:
+//      Since for every input Tensorflow tensor, the rewritten
+//      layer gets Mkl tensor, rewritten op gets 2*N inputs,
+//      where N is the number of inputs for original op.
+//  - Number of outputs after rewriting:
+//      Since for every output Tensorflow tensor, the rewritten
+//      layer generates Mkl tensor, rewritten op generates 2*N
+//      outputs, where N is the number of outputs of original op.
+//  - Ordering of Tensorflow tensors and Mkl tensors:
+//      Since every op generates twice the number of inputs and
+//      outputs, one could imagine different ordering among
+//      Tensorflow tensors and Mkl tensors. E.g., let's assume
+//      an op 'Conv2D' takes (A, B) as input, then new op
+//      'MklConv2D' can take (A, A_m, B, B_m) as input or it
+//      can also take (A, B, A_m, B_m) as input. Among N inputs
+//      one can get N! permutations.
+//
+//      So the question is: which one do we follow? Currently,
+//      we follow an intuitive order where Mkl tensor follows a
+//      corresponding Tensorflow tensor immediately. In the
+//      context of above example, it will be: (A, A_m, B, B_m).
+//      We follow same ordering rule for output tensors.
+//
+// NOTE: Current rewriting approach rewrites an op to Mkl op without
+//      any conditions. But in the future, it may be possible to
+//      consider conditions such as input shapes and sizes to rewrite
+//      an op.
+//
+// Graph rewrite algorithm:
+//      Algorithm: Graph Rewrite
+//      Input: Graph G, Names of nodes to rewrite and their new nodes
+//      Output: Modified Graph G' if nodes are modified, G otherwise.
+//      Start:
+//        N = Topological_Sort(G) // N is set of nodes in toposort order.
+//        foreach node n in N
+//        do
+//          if (Is_MKL_Layer(n))  // Can this layer accept Mkl layout as input.
+//          then
+//            E = set of <incoming edge and its src_output slot> of n
+//            E' = {}   // new set of edges for rewritten node
+//            foreach <e,s> in E
+//            do
+//              E' U {<e,s>}  // First copy edge which generates Tensorflow
+//                            // tensor as it is
+//              m = Source node of edge e
+//              if Is_Rewritten(m)  // Did we rewrite this node in this pass?
+//              then
+//                E' U {<m,s+1>}    // If yes, then m will generate Mkl tensor
+//                                  // as output.
+//              else
+//                d = Generate_Dummy_Mkl_Tensor()  // If not, generate dummy
+//                                                 // Mkl tensor.
+//                E' U {<d,0>}   // Dummy Mkl tensor has only 1 output slot.
+//              fi
+//            done
+//            n' = Build_New_Node(G,new_name,E')
+//            Mark_Rewritten(n')  // Mark new node as being rewritten.
+//          fi
+//        done
+//
+//      Explanation:
+//        For graph rewrite, we visit nodes of the graph in the topological
+//        sort order. With this ordering, we visit nodes in top-to-bottom
+//        fashion. We need this order because while visiting a node we want
+//        all of its input nodes (parents) visited (and rewritten if
+//        applicable). This is because if we need to rewrite a current node
+//        then all of its input nodes need to be fixed (in other words they
+//        cannot be removed later.)
+//
+//        While visiting each node, we first check if it is Mkl layer. If
+//        it is, then we rewrite that node after constructing new inputs to
+//        the node. If it is not Mkl layer, then we do not rewrite the node.
+//
+class MklLayoutRewritePass : public GraphOptimizationPass {
+ public:
+  // Registers the ops this pass rewrites. Currently only "Conv2D" (an op
+  // with 2 inputs) is registered; its attributes are copied by
+  // CopyAttrsConv2D.
+  MklLayoutRewritePass() {
+    csinfo_.conv2d = "Conv2D";
+
+    ninfo_.push_back({csinfo_.conv2d, GetMklOpName(csinfo_.conv2d),
+                      2, CopyAttrsConv2D});
+  }
+
+  // Standard interface to run pass
+  Status Run(const GraphOptimizationPassOptions& options) override;
+
+  // Helper function which does most of heavy lifting for rewriting
+  // Mkl nodes to propagate Mkl tensor as additional output
+  //
+  // Extracts common functionality between Run public interface and
+  // test interface.
+  //
+  // @return true, if and only if graph is mutated; false otherwise.
+  bool RunPass(std::unique_ptr<Graph>* g);
+
+ private:
+  // Structure to specify name of original op, its new name after rewrite,
+  // the number of inputs to the original op, and the function to be used
+  // to copy attributes for the op
+  struct NodesInfo {
+    string name;     // Original name of the op in the graph
+    string newname;  // New name of op in the graph
+    int numins;      // Number of inputs to the original op
+    // Function handler to copy attributes from old node to new node.
+    std::function<void(Node*, NodeBuilder*)> copyattrs;
+  };
+
+  // Structure to store all constant strings
+  struct {
+    string relu;
+    string relugrad;
+    string conv2d;
+  } csinfo_;
+
+  // Maintain info about nodes to rewrite
+  std::vector<NodesInfo> ninfo_;
+
+  // Set of nodes that were produced by a rewrite in this pass (see
+  // MarkRewrittenNode / IsRewrittenNode).
+  std::unordered_set<const Node*> visited_nodes_;
+
+  // Predicate to check if we rewrote node 'n'
+  //
+  // If we rewrote the node, then the rewritten node will produce
+  // Mkl tensor as output. If we did not rewrite the node, then
+  // we need to insert dummy Mkl node on the input side.
+  //
+  // Returns true if node is rewritten, false otherwise.
+  bool IsRewrittenNode(Node* n) const {
+    return visited_nodes_.find(n) != visited_nodes_.end();
+  }
+
+  // Mark the node as rewritten
+  void MarkRewrittenNode(Node* n) {
+    visited_nodes_.insert(n);
+  }
+
+  // Get the name of Mkl op from original TensorFlow op
+  // We prefix 'Mkl' to the original op to get Mkl op.
+  // TODO(nhasabni) We should move this to mkl_util.h.
+  string GetMklOpName(const string& name) const {
+    // Prefix that we add to Tensorflow op name to construct Mkl op name.
+    const char* const kMklOpPrefix = "Mkl";
+    return string(kMklOpPrefix) + name;
+  }
+
+  // Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb'
+  // in graph 'g'. Original node is input in 'orign'.
+  //
+  // For details, refer to 'Number of inputs after rewriting' section in the
+  // documentation above.
+  //
+  // Returns Status::OK() if setting up inputs is successful, otherwise
+  // returns appropriate status code.
+  Status SetUpInputs(std::unique_ptr<Graph>* g,
+                    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+                    NodeBuilder* nb, Node* orign);
+
+  // Rewrite Node 'n' in graph 'g' with rewrite information specified in 'ni'
+  // Returns Status::OK() if node rewrite is successful, otherwise returns
+  // appropriate error status
+  Status RewriteNode(std::unique_ptr<Graph>* g, Node* n, const NodesInfo& ni);
+
+  // Functions specific to operators to copy attributes
+  // We need operator-specific function to copy attributes because the framework
+  // does not provide any generic function for it.
+  static void CopyAttrsConv2D(Node* orign, NodeBuilder* nb);
+
+  // Generate a graph node in graph 'g' representing a dummy Mkl tensor node,
+  // using node for original node 'orign' and return it in '*out'.
+  // TODO(nhasabni) We should move this to mkl_util.h
+  void GetDummyMklTensorNode(std::unique_ptr<Graph>* g, Node** out,
+                             Node* orign);
+};
+
+
+// Register the Mkl layout rewrite pass in the PRE_PLACEMENT group as phase 1.
+// Do not change the ordering of the Mkl passes.
+REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 1,
+                      MklLayoutRewritePass);
+
+
+// Collects the incoming edges of node 'n'.
+//
+// On return 'control_edges' holds the source node of every incoming control
+// edge, sorted by pointer value, and (*in)[i] holds the <source node, source
+// output slot> pair that feeds data input 'i' of 'n'. 'in' must already be
+// sized to n->num_inputs().
+static void FillInputs(const Node* n,
+                       gtl::InlinedVector<Node*, 4>* control_edges,
+                       gtl::InlinedVector<std::pair<Node*, int>, 4>* in) {
+  DCHECK_EQ(in->size(), n->num_inputs());
+  control_edges->clear();
+  for (const Edge* edge : n->in_edges()) {
+    if (!edge->IsControlEdge()) {
+      (*in)[edge->dst_input()] =
+          std::pair<Node*, int>(edge->src(), edge->src_output());
+      continue;
+    }
+    control_edges->push_back(edge->src());
+  }
+  std::sort(control_edges->begin(), control_edges->end());
+
+  // Only commutative ops get their data inputs reordered: the pairs are
+  // sorted (input Node* first) to obtain a canonical ordering, so that
+  // add(a, b) and add(b, a) end up with the same input sequence.
+  if (!n->op_def().is_commutative()) {
+    return;
+  }
+  std::sort(in->begin(), in->end());
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+// Macros to build new node with different number of inputs.
+// We need this way because we need to specify all the inputs when
+// building a node. Comment at core/graph/node_builder.h, line 85-86.
+//
+// SETUP_INPUTS<k> adds 'k' inputs, in argument order, to the NodeBuilder
+// pointed to by 'nb'. Every 'op' argument must expose 'node' and 'index'
+// members. All macro arguments are parenthesized so that arbitrary
+// expressions can be passed safely.
+
+#define SETUP_INPUTS1(nb, op1) do {                  \
+  (nb)->Input((op1).node, (op1).index);              \
+} while (0)
+
+#define SETUP_INPUTS2(nb, op1, op2) do {             \
+  (nb)->Input((op1).node, (op1).index);              \
+  (nb)->Input((op2).node, (op2).index);              \
+} while (0)
+
+#define SETUP_INPUTS3(nb, op1, op2, op3) do {        \
+  (nb)->Input((op1).node, (op1).index);              \
+  (nb)->Input((op2).node, (op2).index);              \
+  (nb)->Input((op3).node, (op3).index);              \
+} while (0)
+
+#define SETUP_INPUTS4(nb, op1, op2, op3, op4) do {   \
+  (nb)->Input((op1).node, (op1).index);              \
+  (nb)->Input((op2).node, (op2).index);              \
+  (nb)->Input((op3).node, (op3).index);              \
+  (nb)->Input((op4).node, (op4).index);              \
+} while (0)
+
+#define SETUP_INPUTS5(nb, op1, op2, op3, op4, op5) do { \
+  (nb)->Input((op1).node, (op1).index);              \
+  (nb)->Input((op2).node, (op2).index);              \
+  (nb)->Input((op3).node, (op3).index);              \
+  (nb)->Input((op4).node, (op4).index);              \
+  (nb)->Input((op5).node, (op5).index);              \
+} while (0)
+
+// TODO(nhasabni) We should move this to mkl_util.h.
+//
+// Builds a "Const" node (name prefix "DMT") in graph '*g' that stands in for
+// a missing Mkl tensor and returns it in '*out'. The node requests the same
+// device as the original node 'orign'. A failure to build the node is fatal.
+void MklLayoutRewritePass::GetDummyMklTensorNode(
+    std::unique_ptr<Graph>* g, Node** out, Node* orign) {
+  // The dummy Mkl tensor is a uint8 tensor of shape {8} whose elements are
+  // all zero.
+  const uint8 kZeros[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+  const DataType dtype = DataTypeToEnum<uint8>::v();
+
+  TensorProto tensor;
+  tensor.set_dtype(dtype);
+  tensor.set_tensor_content(static_cast<const void*>(kZeros), sizeof(kZeros));
+  TensorShape({8}).AsProto(tensor.mutable_tensor_shape());
+
+  Graph* const graph = g->get();
+  NodeBuilder builder(graph->NewName("DMT"), "Const");
+  builder.Attr("value", tensor);
+  builder.Attr("dtype", dtype);
+  // We place this node on same device as device of original node.
+  builder.Device(orign->def().device());
+  TF_CHECK_OK(builder.Finalize(graph, out));
+}
+
+// Sets up the inputs of the rewritten node that is being built in 'nb'.
+//
+// For every original input <node, slot> in 'inputs' two inputs are added, in
+// this order: the original Tensorflow tensor, followed by its Mkl tensor.
+// The Mkl tensor comes from output slot 'slot + 1' of the producer if the
+// producer was rewritten by this pass; otherwise a fresh dummy Mkl tensor
+// node is created in graph '*g' (placed like 'orign') and its output 0 is
+// used.
+//
+// Any number of inputs is supported. Returns Status::OK().
+Status MklLayoutRewritePass::SetUpInputs(std::unique_ptr<Graph>* g,
+    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+    NodeBuilder* nb, Node* orign) {
+  std::vector<NodeBuilder::NodeOut> new_inputs;
+  new_inputs.reserve(inputs.size() * 2);
+
+  // 1. Let's setup inputs for the new node.
+  for (const std::pair<Node*, int>& input : inputs) {
+    Node* n = input.first;
+    const int slot = input.second;
+    // First let's copy original TF tensor input as it is.
+    new_inputs.push_back(NodeBuilder::NodeOut(n, slot));
+
+    // Second, let's add edge to propagate Mkl tensors from input Mkl layers,
+    // or generate a dummy Mkl tensor representing not-mkl-tensor case.
+    if (IsRewrittenNode(n)) {
+      // If we have visited this node and rewritten it, then it will generate
+      // an edge that will receive Mkl tensor from a node.
+      // First, let's assert that this op is Mkl layer: it must carry a "T"
+      // attribute and its op name must be a registered Mkl op name.
+      DataType T;
+      TF_CHECK_OK(GetNodeAttr(n->def(), "T", &T));
+      CHECK_EQ(mkl_layer_registry::IsMklLayer(n->type_string()), true);
+      // src slot number for Mkl tensor would be the one next to TF tensor
+      // slot number.
+      new_inputs.push_back(NodeBuilder::NodeOut(n, slot + 1));
+    } else {
+      // If we have not visited the node and rewritten it, then we need
+      // to create a dummy node that will feed a non-Mkl tensor to this node.
+      // DummyMklTensor node has no input and generates only 1 output
+      // (dummy Mkl tensor) as output slot number 0.
+      Node* dmt = nullptr;
+      GetDummyMklTensorNode(g, &dmt, orign);
+      CHECK_NOTNULL(dmt);
+      new_inputs.push_back(NodeBuilder::NodeOut(dmt, 0));
+    }
+  }
+
+  // The total number of inputs to new node _must_ be 2 times the number
+  // of inputs to the original node: N original Tensorflow tensors and
+  // N for Mkl tensors corresponding to each Tensorflow tensors.
+  CHECK_EQ(new_inputs.size(), inputs.size() * 2);
+
+  // 2. Let's add the new inputs to the node builder. Inputs are added one at
+  // a time in order, which works for any number of inputs.
+  for (const NodeBuilder::NodeOut& new_input : new_inputs) {
+    nb->Input(new_input.node, new_input.index);
+  }
+
+  return Status::OK();
+}
+
+// Copies the Conv2D attributes "T", "strides", "padding", "data_format" and
+// "use_cudnn_on_gpu" from the original node 'orign' to the node builder
+// 'nb'. A missing attribute on 'orign' is fatal.
+void MklLayoutRewritePass::CopyAttrsConv2D(Node* orign, NodeBuilder* nb) {
+  const auto& src_def = orign->def();
+
+  // Read every attribute from the old node first.
+  DataType dtype;
+  std::vector<int32> stride_values;
+  string padding_type;
+  string format;
+  bool cudnn_on_gpu;
+  TF_CHECK_OK(GetNodeAttr(src_def, "T", &dtype));
+  TF_CHECK_OK(GetNodeAttr(src_def, "strides", &stride_values));
+  TF_CHECK_OK(GetNodeAttr(src_def, "padding", &padding_type));
+  TF_CHECK_OK(GetNodeAttr(src_def, "data_format", &format));
+  TF_CHECK_OK(GetNodeAttr(src_def, "use_cudnn_on_gpu", &cudnn_on_gpu));
+
+  // Then attach them, under the same names, to the new node.
+  nb->Attr("T", dtype)
+      .Attr("strides", stride_values)
+      .Attr("padding", padding_type)
+      .Attr("data_format", format)
+      .Attr("use_cudnn_on_gpu", cudnn_on_gpu);
+}
+
+// Replaces node 'orign' in graph '*g' by a new node whose op is 'ni.newname'.
+//
+// The new node keeps the name, requested device and assigned device of
+// 'orign', gets the inputs produced by SetUpInputs, the attributes copied by
+// 'ni.copyattrs' plus the "_kernel" Mkl layer label, and takes over the
+// incoming control edges and all outgoing edges of 'orign'. 'orign' is
+// removed from the graph and must not be used afterwards.
+//
+// Returns the error from SetUpInputs if inputs cannot be set up, otherwise
+// Status::OK(). Failure to finalize the new node is fatal.
+Status MklLayoutRewritePass::RewriteNode(
+    std::unique_ptr<Graph>* g, Node* orign, const NodesInfo& ni) {
+  VLOG(1) << "MKLLayoutRewritePass: Original node:" << orign->DebugString();
+
+  // Get all inputs.
+  const int num = orign->num_inputs();
+  CHECK_EQ(num, ni.numins);
+  gtl::InlinedVector<Node*, 4> control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> inputs(num);
+  FillInputs(orign, &control_edges, &inputs);
+
+  // Build new node. We use same name as original node, but change the op name.
+  NodeBuilder nb(orign->name().c_str(), ni.newname.c_str());
+  // Copy user-specified device assigned to original node to new node.
+  nb.Device(orign->def().device());
+  // Set up new inputs to the rewritten node.
+  Status s = SetUpInputs(g, inputs, &nb, orign);
+  if (!s.ok()) {
+    return s;
+  }
+  // Copy attributes from original node to new node.
+  ni.copyattrs(orign, &nb);
+  // Set the Mkl layer label for this op.
+  nb.Attr("_kernel", mkl_layer_registry::kMklLayerLabel);
+  Node* newn = nullptr;
+
+  // Finalize graph and get new node.
+  TF_CHECK_OK(nb.Finalize(g->get(), &newn));
+  CHECK_NOTNULL(newn);
+
+  // Incoming data edges from 'orign' node to new 'newn' node are already
+  // copied while building the node. Incoming control edges are not part of
+  // the data inputs, so re-create them explicitly; otherwise the control
+  // dependencies of the original node would be lost.
+  for (Node* control_src : control_edges) {
+    (*g)->AddControlEdge(control_src, newn);
+  }
+
+  // Copy outgoing edges from 'orign' node to new 'newn' node.
+  for (const Edge* e : orign->out_edges()) {
+    (*g)->AddEdge(newn, e->src_output(), e->dst(), e->dst_input());
+  }
+
+  // Copy the runtime device assigned from original code to new node.
+  newn->set_assigned_device_name(orign->assigned_device_name());
+
+  // Delete original node and mark new node as rewritten.
+  (*g)->RemoveNode(orign);
+  MarkRewrittenNode(newn);
+
+  VLOG(1) << "MKLLayoutRewritePass: New node:" << newn->DebugString();
+  return Status::OK();
+}
+
+// Visits the nodes of '*g' in reverse post order and rewrites every op node
+// that matches a registered NodesInfo entry. The graph is dumped before and
+// after the pass.
+//
+// Returns true if at least one node was rewritten, false otherwise.
+bool MklLayoutRewritePass::RunPass(
+    std::unique_ptr<Graph>* g) {
+  bool mutated = false;
+  CHECK_NOTNULL(g);
+
+  DumpGraph("Before running MklLayoutRewritePass", g->get());
+
+  // Reverse post order gives us a topological sort of the graph.
+  std::vector<Node*> topo_order;
+  GetReversePostOrder(**g, &topo_order);
+
+  for (Node* n : topo_order) {
+    if (!n->IsOp()) {
+      continue;
+    }
+
+    for (const NodesInfo& ni : ninfo_) {
+      // An op needs to have data type (T) attribute and its corresponding
+      // Mkl op name must be supported.
+      DataType dtype = DT_INVALID;
+      if (!GetNodeAttr(n->def(), "T", &dtype).ok()) {
+        continue;
+      }
+      if (!mkl_layer_registry::IsMklLayer(GetMklOpName(n->type_string()))) {
+        continue;
+      }
+      // The op of the node must be the one this entry rewrites.
+      if (n->type_string() != ni.name) {
+        continue;
+      }
+
+      // Remember the names now: a successful rewrite removes 'n' from the
+      // graph.
+      const string node_name = n->name();
+      const string op_name = n->type_string();
+
+      VLOG(1) << "MKLLayoutRewritePass: Scheduled node " << node_name
+              << " with op " << op_name << " for rewrite using"
+              << " layout optimization.";
+
+      if (!RewriteNode(g, n, ni).ok()) {
+        continue;
+      }
+      VLOG(1) << "MKLLayoutRewritePass: Successfully rewrote node "
+              << node_name << " with op " << op_name
+              << " for Mkl layout optimization.";
+      mutated = true;
+      break;  // We found matching nodesinfo so no need to search next.
+    }
+  }
+
+  DumpGraph("After running MklLayoutRewritePass", g->get());
+
+  return mutated;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//              Run function for the pass
+///////////////////////////////////////////////////////////////////////////////
+
+// Runs a freshly constructed MklLayoutRewritePass on '*g'.
+// Returns true if and only if the graph was mutated.
+bool RunMklLayoutRewritePass(std::unique_ptr<Graph>* g) {
+  MklLayoutRewritePass pass;
+  return pass.RunPass(g);
+}
+
+// Standard GraphOptimizationPass entry point.
+//
+// Runs the rewrite on the graph referenced by 'options.graph'. Does nothing
+// when no graph is supplied. Always returns Status::OK().
+Status MklLayoutRewritePass::Run(const GraphOptimizationPassOptions& options) {
+  if (options.graph == nullptr || *options.graph == nullptr) {
+    return Status::OK();
+  }
+
+  // The graph is rewritten in place through the caller's unique_ptr, so
+  // ownership stays with the caller throughout; no ownership transfer is
+  // needed.
+  RunPass(options.graph);
+
+  return Status::OK();
+}
+
+}  // namespace tensorflow
+
+#endif
--- a/tensorflow/core/graph/mkl_layout_pass.h
+++ b/tensorflow/core/graph/mkl_layout_pass.h
@ -0,0 +1,36 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// A graph pass that rewrites graph for propagating MKL layout as a tensor
+
+#ifndef TENSORFLOW_GRAPH_MKL_LAYOUT_PASS_H_
+#define TENSORFLOW_GRAPH_MKL_LAYOUT_PASS_H_
+
+#ifdef INTEL_MKL
+
+#include <sys/types.h>
+#include <memory>
+#include "tensorflow/core/graph/graph.h"
+
+namespace tensorflow {
+
+// Entry point used by unit tests to invoke the Mkl layout rewrite pass
+// on graph 'g'.
+//
+// Returns true if and only if 'g' is mutated.
+bool RunMklLayoutRewritePass(std::unique_ptr<Graph>* g);
+
+}  // namespace tensorflow
+
+#endif
+
+#endif  // TENSORFLOW_GRAPH_MKL_LAYOUT_PASS_H_
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@ -0,0 +1,199 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+
+#include "tensorflow/core/graph/mkl_layout_pass.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+#include <vector>
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+namespace {
+
+// Parses the text-format GraphDef in 's' and converts it into '*graph'.
+// A parse failure or a conversion failure is fatal.
+static void InitGraph(const string& s, Graph* graph) {
+  protobuf::TextFormat::Parser text_parser;
+  GraphDef parsed_def;
+  CHECK(text_parser.MergeFromString(s, &parsed_def)) << s;
+
+  const GraphConstructorOptions default_opts;
+  TF_CHECK_OK(ConvertGraphDefToGraph(default_opts, parsed_def, graph));
+}
+
+// Test fixture that owns a graph, runs the Mkl layout rewrite pass on it and
+// renders graphs as canonical strings that tests can compare against.
+class MklLayoutPassTest : public ::testing::Test {
+ public:
+  MklLayoutPassTest() : graph_(OpRegistry::Global()) {}
+
+  // Builds graph_ from the text-format GraphDef 's' and records the canonical
+  // string of the resulting graph in original_.
+  void InitGraph(const string& s) {
+    ::tensorflow::InitGraph(s, &graph_);
+    original_ = CanonicalGraphString(&graph_);
+  }
+
+  // Only op nodes take part in the canonical string.
+  static bool IncludeNode(const Node* n) { return n->IsOp(); }
+
+  // Name of endpoint 'index' of node 'n': "name" for slot 0, "name:control"
+  // for the control slot and "name:<index>" otherwise.
+  static string EdgeId(const Node* n, int index) {
+    if (index == 0) {
+      return n->name();
+    } else if (index == Graph::kControlSlot) {
+      return strings::StrCat(n->name(), ":control");
+    } else {
+      return strings::StrCat(n->name(), ":", index);
+    }
+  }
+
+  // Renders 'g' as "<nodes>|<edges>": nodes as "name(op)" and edges as
+  // "src->dst", each list sorted and joined with ';'.
+  string CanonicalGraphString(Graph* g) {
+    std::vector<string> nodes;
+    std::vector<string> edges;
+    for (const Node* n : g->nodes()) {
+      if (IncludeNode(n)) {
+        nodes.push_back(strings::StrCat(n->name(), "(", n->type_string(), ")"));
+      }
+    }
+    for (const Edge* e : g->edges()) {
+      if (IncludeNode(e->src()) && IncludeNode(e->dst())) {
+        edges.push_back(strings::StrCat(EdgeId(e->src(), e->src_output()), "->",
+                                        EdgeId(e->dst(), e->dst_input())));
+      }
+    }
+    // Canonicalize
+    std::sort(nodes.begin(), nodes.end());
+    std::sort(edges.begin(), edges.end());
+    return strings::StrCat(str_util::Join(nodes, ";"), "|",
+                           str_util::Join(edges, ";"));
+  }
+
+  // Runs the Mkl layout rewrite pass on graph_ and returns the canonical
+  // string of the graph after the pass.
+  string DoMklLayoutOptimizationPass() {
+    string before = CanonicalGraphString(&graph_);
+    LOG(ERROR) << "Before MKL layout rewrite pass: " << before;
+
+    // The pass interface takes a unique_ptr, but graph_ is a member of this
+    // fixture and must never be deleted through it. Wrap graph_ only for the
+    // duration of the call and give up ownership again before 'ug' goes out
+    // of scope.
+    std::unique_ptr<Graph> ug(&graph_);
+    RunMklLayoutRewritePass(&ug);
+    (void)ug.release();
+
+    string result = CanonicalGraphString(&graph_);
+    LOG(ERROR) << "After MKL layout rewrite pass:  " << result;
+    return result;
+  }
+
+  const string& OriginalGraph() const { return original_; }
+
+  Graph graph_;
+  string original_;
+};
+
+// Test-only op with no inputs and a single float output; used as the source
+// of tensors in the test graphs below.
+REGISTER_OP("Input").Output("o: float").SetIsStateful();
+
+// Single Conv2D Op; No Mkl layer on the input and on the output.
+// Neither input of Conv2D is produced by a rewritten node, so a dummy Mkl
+// tensor is generated for each of them: DMT/_0 feeds input slot 1 and
+// DMT/_1 feeds input slot 3 of the rewritten MklConv2D, while the original
+// inputs A and B move to slots 0 and 2.
+TEST_F(MklLayoutPassTest, Conv2D_Basic) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(MklConv2D);D(Mul);DMT/_0(Const);DMT/_1(Const)|"
+            "A->C;B->C:2;B->D;C->D:1;DMT/_0->C:1;DMT/_1->C:3");
+}
+
+// 2 Conv2D Ops in sequence. Both should get transformed and 1st Conv2D will
+// have 2 outputs, both of which will be inputs to next Conv2D.
+// In the expected graph, D receives output 0 of C on slot 2 and output 1 of
+// C (the Mkl tensor) on slot 3; its other input A is not produced by a
+// rewritten node, so it is paired with the dummy Mkl tensor DMT/_2 on slot 1.
+TEST_F(MklLayoutPassTest, Conv2D_Positive1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'C']}"
+      "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(MklConv2D);D(MklConv2D);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);E(Mul)|A->C;A->D;B->C:2;C->D:2;C->E;"
+            "C:1->D:3;D->E:1;DMT/_0->C:1;DMT/_1->C:3;DMT/_2->D:1");
+}
+
+// Benchmarks the Mkl layout rewrite pass on a graph with 10 'Input' nodes and
+// 'op_nodes' 'Mul' nodes whose two inputs are picked pseudo-randomly (fixed
+// seed) among the 'Input' nodes. Only the pass itself is timed; building the
+// graph is not.
+static void BM_MklLayoutRewritePass(int iters, int op_nodes) {
+  testing::StopTiming();
+
+  // Text-format GraphDef of the benchmark graph.
+  string graph_text;
+  for (int in = 0; in < 10; in++) {
+    graph_text += strings::Printf("node { name: 'in%04d' op: 'Input'}", in);
+  }
+  random::PhiloxRandom philox(301, 17);
+  random::SimplePhilox rnd(&philox);
+  for (int op = 0; op < op_nodes; op++) {
+    graph_text += strings::Printf(
+        "node { name: 'op%04d' op: 'Mul' attr { key: 'T' value { "
+        "type: DT_FLOAT } } input: ['in%04d', 'in%04d' ] }",
+        op, rnd.Uniform(10), rnd.Uniform(10));
+  }
+
+  bool label_set = false;
+  while (iters > 0) {
+    // A fresh graph is built for every run; it is destroyed at the end of
+    // the loop body, after timing has been stopped.
+    std::unique_ptr<Graph> ug(new Graph(OpRegistry::Global()));
+    InitGraph(graph_text, ug.get());
+    const int num_nodes = ug->num_node_ids();
+    if (!label_set) {
+      testing::SetLabel(
+          strings::StrCat("Per graph node.  Nodes: ", num_nodes));
+      label_set = true;
+    }
+
+    testing::StartTiming();
+    RunMklLayoutRewritePass(&ug);
+    testing::StopTiming();
+
+    // Our benchmark units are individual graph nodes, not whole graphs.
+    iters -= num_nodes;
+  }
+}
+BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000);
+
+}  // namespace
+}  // namespace tensorflow
+
+#endif /* INTEL_MKL */
--- a/tensorflow/core/graph/mkl_optimizer_merge.cc
+++ b/tensorflow/core/graph/mkl_optimizer_merge.cc
@ -22,6 +22,8 @@ limitations under the License.
 #include <vector>
 #include <queue>
 #include <utility>
+#include <string>
+#include <memory>

 #include "tensorflow/core/graph/mkl_optimizer_merge.h"

@ -33,6 +35,8 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"

 namespace tensorflow {

@ -58,8 +62,8 @@ static size_t kNodeMergeContextMaxDepth = 10;
 class NodeMergeRewritePass : public GraphOptimizationPass {
 public:
  NodeMergeRewritePass() {
-    csinfo_.conv2d                     = "Conv2D";
-    csinfo_.conv2dwithbias             = "Conv2DWithBias";
+    csinfo_.conv2d                     = "MklConv2D";
+    csinfo_.conv2dwithbias             = "MklConv2DWithBias";
    csinfo_.conv2dwithbiasbackpropbias = "Conv2DWithBiasBackpropBias";
    csinfo_.biasadd                    = "BiasAdd";
    csinfo_.matmul                     = "MatMul";
@ -72,6 +76,9 @@ class NodeMergeRewritePass : public GraphOptimizationPass {
    // maxhops in backward data-flow graph. Since input of forward nodes
    // (Conv2D) directly goes to backward nodes, we do not expect the
    // hop distance to be more than a few nodes.
+    // TODO(nhasabni) Temporarily disabling rewrite of BiasAddGrad.
+    // Will enable it once we support Conv2DWithBiasBackpropBias op.
+#if 0
    rinfo_.push_back({csinfo_.biasaddgrad, csinfo_.conv2dwithbiasbackpropbias,
                  {csinfo_.conv2dwithbias, kNodeMergeContextMaxDepth}});
    rinfo_.push_back({csinfo_.biasaddgrad, csinfo_.conv2dwithbiasbackpropbias,
@ -80,6 +87,7 @@ class NodeMergeRewritePass : public GraphOptimizationPass {
    // because we do not have a separate Op for MatMulwithBias.
    rinfo_.push_back({csinfo_.biasaddgrad, csinfo_.biasaddgrad,
                      {csinfo_.matmul, kNodeMergeContextMaxDepth}});
+#endif
  }

  // Standard interface to run optimization pass
@ -182,10 +190,16 @@ class NodeMergeRewritePass : public GraphOptimizationPass {
  // @return Matching rewriteinfo in case a match is found; null otherwise.
  const RewriteInfo* FindMatchingRewriteInfo(const Node* n,
                                             const Node** fwdn) const;
+
+  // Generate a graph node in graph 'g' representing a dummy Mkl tensor node,
+  // and return it in '*out'.
+  // TODO(nhasabni) We should move this to mkl_util.h
+  void GetDummyMklTensorNode(std::unique_ptr<Graph>* g, Node** out);
 };

-/// We register merge optimizer for phase 1 and MKLToTF insertion for phase 2.
-REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 1,
+// We register merge optimizer for phase 2 in pre-placement group.
+// Do not change the ordering of the Mkl passes.
+REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 2,
                      NodeMergeRewritePass);

 static void FillInputs(const Node* n,
@ -219,8 +233,6 @@ Node* NodeMergeRewritePass::FindNodeForMerge(const Node* a) const {
    }
  }

-  VLOG(1) << "FindNodeForMerge: " << a->type_string();
-
  for (const MergeInfo* mi : matching_mi) {
    const int N_in = a->num_inputs();
    if (mi->op >= N_in) {
@ -240,8 +252,6 @@ Node* NodeMergeRewritePass::FindNodeForMerge(const Node* a) const {
      continue;
    }

-    VLOG(1) << "     FindNode: " << b->type_string();
-
    gtl::InlinedVector<Node*, 4> b_control_edges;
    gtl::InlinedVector<std::pair<Node*, int>, 4> b_in(N_in);
    FillInputs(b, &b_control_edges, &b_in);
@ -258,6 +268,22 @@ Node* NodeMergeRewritePass::FindNodeForMerge(const Node* a) const {
  return nullptr;
 }

+// Creates a dummy Mkl tensor node in graph '*g' and returns it in '*out'.
+//
+// The node is a 'Const' op whose name is generated from the "DMT" prefix and
+// whose value is a 1-D uint8 tensor of 8 zero bytes. Node construction
+// failures are fatal (TF_CHECK_OK).
+//
+// @input  g   - graph in which the node is created
+// @output out - set to the newly created node
+void NodeMergeRewritePass::GetDummyMklTensorNode(
+    std::unique_ptr<Graph>* g, Node** out) {
+  const DataType dt = DataTypeToEnum<uint8>::v();
+  // All-zero payload. Its size is the single source of truth for both the
+  // serialized content length and the tensor shape below.
+  const uint8 zero[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+
+  TensorProto proto;
+  proto.set_dtype(dt);
+  proto.set_tensor_content(zero, sizeof(zero));
+  TensorShape dummy_shape({static_cast<int64>(sizeof(zero))});
+  dummy_shape.AsProto(proto.mutable_tensor_shape());
+  TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
+                 .Attr("value", proto)
+                 .Attr("dtype", dt)
+                 .Finalize(&**g, out));
+}
+
 Status NodeMergeRewritePass::MergeNode(std::unique_ptr<Graph>* g,
                                     Node* succ, Node* pred) {
  CHECK_NOTNULL(succ);
@ -271,7 +297,6 @@ Status NodeMergeRewritePass::MergeNode(std::unique_ptr<Graph>* g,
    std::vector<int32> strides;
    string data_format_pred, data_format_succ;
    bool use_cudnn_on_gnu;
-    int groups = 1;
    TF_CHECK_OK(GetNodeAttr(pred->def(), "T", &T_pred));
    TF_CHECK_OK(GetNodeAttr(succ->def(), "T", &T_succ));
    TF_CHECK_OK(GetNodeAttr(pred->def(), "padding", &padding));
@ -280,25 +305,28 @@ Status NodeMergeRewritePass::MergeNode(std::unique_ptr<Graph>* g,
    TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ));
    TF_CHECK_OK(GetNodeAttr(pred->def(), "use_cudnn_on_gpu",
                            &use_cudnn_on_gnu));
-    // Groups attribute may not be there on the input node. So we do not
-    // check for error in GetNodeAttr call.
-    GetNodeAttr(pred->def(), "groups", &groups);
    // We check to ensure that data formats of both succ and pred are same.
    // We expect them to be same, so we can enforce this as assert.
    // But assert can be too strict, so we enforce this as a check.
    // If the check fails, then we do not merge two nodes.
+    // We also do the same check for devices.
    if (data_format_pred != data_format_succ ||
-        T_pred != T_succ) {
+        T_pred != T_succ ||
+        pred->assigned_device_name() != succ->assigned_device_name() ||
+        pred->def().device() != succ->def().device()) {
      return Status(error::Code::INVALID_ARGUMENT,
-                    "data_format or T attribute of Conv2D and BiasAdd"
-                    "do not match. Will skip node merge optimization");
+                    "data_format or T attribute or devices of Conv2D and "
+                    "BiasAdd do not match. Will skip node merge optimization");
    }

    // 2. Get inputs from both the nodes.
    // Find the 2 inputs from the conv and the bias from the add Bias.
    Node* oper1 = nullptr;
+    Node* oper1_mkl = nullptr;  // Mkl tensor corresponding to oper1
    Node* oper2 = nullptr;
+    Node* oper2_mkl = nullptr;  // Mkl tensor corresponding to oper2
    Node* oper3 = nullptr;
+    Node* oper3_mkl = nullptr;  // Mkl tensor corresponding to oper3

    const int succ_num = succ->num_inputs();
    gtl::InlinedVector<Node*, 4> succ_control_edges;
@ -326,24 +354,35 @@ Status NodeMergeRewritePass::MergeNode(std::unique_ptr<Graph>* g,
      }
    }

-    // Get operand 0, 1 of conv2D
+    // Get operand 0, 1 of conv2D and their Mkl tensors.
+    CHECK_EQ(pred->in_edges().size(), 4);  // MklConv2D must have 4 inputs.
    oper1     = pred_in[0].first;
-    oper2 = pred_in[1].first;
+    oper1_mkl = pred_in[1].first;
+    oper2     = pred_in[2].first;
+    oper2_mkl = pred_in[3].first;
    // Get operand 1 of add_bias
+    // BiasAdd must have 2 inputs: Conv, bias
+    CHECK_EQ(succ->in_edges().size(), 2);
    oper3     = succ_in[1].first;
+    GetDummyMklTensorNode(g, &oper3_mkl);  // Get dummy Mkl tensor node
+    // as BiasAdd does not have Mkl tensor as input.
+    CHECK_NOTNULL(oper3_mkl);

    Node* ret;
    // We will use the node name of BiasAdd as the name of new node
    TF_CHECK_OK(NodeBuilder(succ->name(), csinfo_.conv2dwithbias)
                  .Input(oper1)
+                  .Input(oper1_mkl)
                  .Input(oper2)
+                  .Input(oper2_mkl)
                  .Input(oper3)
+                  .Input(oper3_mkl)
                  .Attr("T", T_pred)
                  .Attr("strides", strides)
                  .Attr("padding", padding)
                  .Attr("data_format", data_format_pred)
                  .Attr("use_cudnn_on_gpu", use_cudnn_on_gnu)
-                  .Attr("groups", groups)
+                  .Device(succ->def().device())
                  .Finalize(&**g, &ret));
    CHECK_NOTNULL(ret);

@ -352,6 +391,15 @@ Status NodeMergeRewritePass::MergeNode(std::unique_ptr<Graph>* g,
      (*g)->AddEdge(ret, e->src_output(), e->dst(), e->dst_input());
    }

+    // Copy device assigned to old node to new node.
+    // It's ok to use pred or succ as we have enforced a check that
+    // both have same device assigned.
+    ret->set_assigned_device_name(pred->assigned_device_name());
+
+    VLOG(1) << "NodeMergeRewritePass: Merged old node:" << pred->DebugString()
+            << ", and node: " << succ->DebugString() << ", into node:"
+            << ret->DebugString();
+
    (*g)->RemoveNode(succ);
    (*g)->RemoveNode(pred);

@ -369,13 +417,14 @@ Status NodeMergeRewritePass::RewriteNode(std::unique_ptr<Graph>* g, Node *n) {
  const Node* fwdn = nullptr;
  const RewriteInfo* ri = FindMatchingRewriteInfo(n, &fwdn);
  if (ri == nullptr || fwdn == nullptr) {
-    VLOG(1) << "Rewriteinfo not found for: " << n->type_string();
+    VLOG(2) << "NodeMergeRewritePass: Rewriteinfo not found for: "
+            << n->type_string();
    return Status(error::Code::INVALID_ARGUMENT,
                  "Rewrite info not found for the node."
                  "Will skip node rewrite optimization");
  }

-  VLOG(1) << "Rewrite called for: " << n->type_string();
+  VLOG(1) << "NodeMergeRewritePass: Rewrite called for: " << n->type_string();

  if (n->type_string() == csinfo_.biasaddgrad &&
      ri->node         == csinfo_.biasaddgrad &&
@ -407,6 +456,7 @@ Status NodeMergeRewritePass::RewriteNode(std::unique_ptr<Graph>* g, Node *n) {
                    .Attr("T", T)
                    .Attr("data_format", data_format)
                    .Attr("strides", strides)
+                    .Device(n->def().device())
                    .Finalize(&**g, &ret));
    } else {
      CHECK_EQ(ri->rewrite, csinfo_.biasaddgrad);
@ -414,6 +464,7 @@ Status NodeMergeRewritePass::RewriteNode(std::unique_ptr<Graph>* g, Node *n) {
                    .Input(op)
                    .Attr("T", T)
                    .Attr("data_format", data_format)
+                    .Device(n->def().device())
                    .Finalize(&**g, &ret));
    }

@ -424,7 +475,11 @@ Status NodeMergeRewritePass::RewriteNode(std::unique_ptr<Graph>* g, Node *n) {
      (*g)->AddEdge(ret, e->src_output(), e->dst(), e->dst_input());
    }

-    VLOG(1) << "Rewrite node: " << n->type_string() << " successful";
+    // Copy device assigned to old node to new node.
+    ret->set_assigned_device_name(n->assigned_device_name());
+
+    VLOG(1) << "MKLOptimizerMergePass: Rewrote old node:" << n->DebugString()
+            << ", into node:" << ret->DebugString();
    (*g)->RemoveNode(n);

    return Status::OK();
@ -450,7 +505,8 @@ NodeMergeRewritePass::FindMatchingRewriteInfo(const Node* n,
    }
  }

-  VLOG(1) << "Searching graph for: " << n->type_string() << " in backwards.";
+  VLOG(1) << "NodeMergeRewritePass: Searching graph for: "
+          << n->type_string() << " in backwards.";

  // Now we will check for forward op name for rewrite info in data
  // flow graph. Get the max hops we should search for the fwd node
@ -473,7 +529,8 @@ NodeMergeRewritePass::FindMatchingRewriteInfo(const Node* n,
    curr_depth = curr_pair.second;
    CHECK_NOTNULL(curr_node);

-    VLOG(1) << "Visiting node: " << curr_node->type_string()
+    VLOG(1) << "NodeMergeRewritePass: Visiting node: "
+            << curr_node->type_string()
            << " at depth: " << curr_depth
            << " for node: " << n->type_string();

@ -528,17 +585,16 @@ bool NodeMergeRewritePass::RunPass(std::unique_ptr<Graph>* g) {
  std::vector<std::pair<Node*, Node*>> nodes_to_be_merged;
  std::vector<Node*> nodes_to_be_rewritten;

-  VLOG(1) << "Running NodeMerge Optimization";
-
  for (Node* n : order) {
    if (!n->IsOp()) continue;
    Node* n1 = nullptr;
    if ((n1 = FindNodeForMerge(n)) != nullptr) {
-      VLOG(1) << "Scheduled nodes " << n->name() << " and "
-              << n1->name() << " for merging";
+      VLOG(1) << "NodeMergeRewritePass: Scheduled nodes "
+              << n->name() << " and " << n1->name() << " for merging";
      nodes_to_be_merged.push_back(std::make_pair(n, n1));
    } else if (IsApplicableRewriteNode(n)) {
-      VLOG(1) << "Scheduled node " << n->name() << " for rewrite";
+      VLOG(1) << "NodeMergeRewritePass: Scheduled node " << n->name()
+              << " for rewrite";
      nodes_to_be_rewritten.push_back(n);
    }
  }
@ -549,7 +605,8 @@ bool NodeMergeRewritePass::RunPass(std::unique_ptr<Graph>* g) {
    string n1_name = i.first->name();
    string n2_name = i.second->name();
    if (MergeNode(g, i.first, i.second) == Status::OK()) {
-      VLOG(1) << "Merged nodes " << n1_name << " and " << n2_name;
+      VLOG(1) << "NodeMergeRewritePass: Merged nodes " << n1_name
+              << " and " << n2_name;
      result = true;
    }
  }
@ -559,7 +616,8 @@ bool NodeMergeRewritePass::RunPass(std::unique_ptr<Graph>* g) {
  for (Node* i : nodes_to_be_rewritten) {
    string name = i->name();
    if (RewriteNode(g, i) == Status::OK()) {
-      VLOG(1) << "Rewrite node: " << name << " successful.";
+      VLOG(1) << "NodeMergeRewritePass: Rewrite node: "
+              << name << " successful.";
      result = true;
    }
  }
@ -574,8 +632,6 @@ bool OptimizeNodeMerge(std::unique_ptr<Graph>* g) {
 }

 Status NodeMergeRewritePass::Run(const GraphOptimizationPassOptions& options) {
-  // Currently checking only for two cases - Conv2D+Bias and Matmul+Bias.
-  // It is possible to extend it to other operators in future.
  if (options.graph == nullptr) {
    return Status::OK();
  }
--- a/tensorflow/core/graph/mkl_optimizer_merge.h
+++ b/tensorflow/core/graph/mkl_optimizer_merge.h
@ -21,20 +21,14 @@ limitations under the License.
 #ifdef INTEL_MKL

 #include <sys/types.h>
-#include <vector>
-#include <string>
 #include <memory>
 #include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/common_runtime/optimization_registry.h"

 namespace tensorflow {
-
 // Interface to invoke the pass for unit test
 //
 // Returns true if and only if 'g' is mutated.
 extern bool OptimizeNodeMerge(std::unique_ptr<Graph>* g);
-
 }  // namespace tensorflow

 #endif  // INTEL_MKL
--- a/tensorflow/core/graph/mkl_optimizer_merge_test.cc
+++ b/tensorflow/core/graph/mkl_optimizer_merge_test.cc
@ -105,6 +105,7 @@ class OptimizerMergeTest : public ::testing::Test {
 };

 REGISTER_OP("Input").Output("o: float").SetIsStateful();
+REGISTER_OP("MklInput").Output("o: uint8").SetIsStateful();

 TEST_F(OptimizerMergeTest, Basic) {
  InitGraph(
@ -121,8 +122,38 @@ TEST_F(OptimizerMergeTest, Basic) {

 // Test set 1: Conv2D + AddBias

-// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Sub(E,Y)
+// C=MklConv2D(A,M,B,N); E=BiasAdd(C,D); Z=Sub(E,Y)
 TEST_F(OptimizerMergeTest, Conv2DWithBias_Positive) {
+  InitGraph(  // MklConv2D 'C' feeds input 0 of BiasAdd 'E'; 'E' feeds Sub 'Z'.
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'M' op: 'MklInput'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'N' op: 'MklInput'}"
+      "node { name: 'C' op: 'MklConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'M', 'B', 'N']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'BiasAdd'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Sub'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  EXPECT_EQ(DoNodeMerge(),  // 'C' and 'E' become one MklConv2DWithBias 'E';
+            "A(Input);B(Input);D(Input);DMT/_0(Const);E(MklConv2DWithBias);"
+            "M(MklInput);N(MklInput);Y(Input);Z(Sub)|A->E;B->E:2;D->E:4;"
+            "DMT/_0->E:5;E->Z;M->E:1;N->E:3;Y->Z:1");  // DMT/_0 is the dummy
+}
+
+// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Sub(E,Y);
+// We do not merge in this case as op is Conv2D and not MklConv2D.
+TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_NoMklConv2D) {
  InitGraph(
      "node { name: 'A' op: 'Input'}"
      "node { name: 'B' op: 'Input'}"
@ -143,63 +174,69 @@ TEST_F(OptimizerMergeTest, Conv2DWithBias_Positive) {
      " attr {key: 'T'                 value { type: DT_FLOAT } }"
      " input: ['E', 'Y']}");
  EXPECT_EQ(DoNodeMerge(),
-            "A(Input);B(Input);D(Input);E(Conv2DWithBias);Y(Input);Z(Sub)|"
-             "A->E;B->E:1;D->E:2;E->Z;Y->Z:1");
+            "A(Input);B(Input);C(Conv2D);D(Input);E(BiasAdd);Y(Input);Z(Sub)|"
+             "A->C;B->C:1;C->E;D->E:1;E->Z;Y->Z:1");
 }

-// Graph contains only Conv2D, no AddBias.
+// Graph contains only MklConv2D, no AddBias.
 TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_NoAddBias) {
  InitGraph(
      "node { name: 'A' op: 'Input'}"
+      "node { name: 'M' op: 'MklInput'}"
      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Conv2D'"
+      "node { name: 'N' op: 'MklInput'}"
+      "node { name: 'C' op: 'MklConv2D'"
      " attr { key: 'T'                value { type: DT_FLOAT } }"
      " attr { key: 'data_format'      value { s: 'NCHW' } }"
      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B']}");
+      " input: ['A', 'M', 'B', 'N']}");
  EXPECT_EQ(DoNodeMerge(),
-            "A(Input);B(Input);C(Conv2D)|"
-             "A->C;B->C:1");
+            "A(Input);B(Input);C(MklConv2D);M(MklInput);N(MklInput)|"
+             "A->C;B->C:2;M->C:1;N->C:3");
 }

-// Conv2D output does not go to BiasAdd.
+// MklConv2D output does not go to BiasAdd.
 TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_Dataflow1) {
  InitGraph(
      "node { name: 'A' op: 'Input'}"
+      "node { name: 'M' op: 'MklInput'}"
      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Conv2D'"
+      "node { name: 'N' op: 'MklInput'}"
+      "node { name: 'C' op: 'MklConv2D'"
      " attr { key: 'T'                value { type: DT_FLOAT } }"
      " attr { key: 'data_format'      value { s: 'NCHW' } }"
      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B']}"
+      " input: ['A', 'M', 'B', 'N']}"
      "node { name: 'D' op: 'Input'}"
      "node { name: 'E' op: 'Input'}"
      "node { name: 'F' op: 'BiasAdd'"
      " attr { key: 'T'                value { type: DT_FLOAT } }"
      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['D', 'E'] }");  // Output of Conv2D does not go to BiasAdd.
+      " input: ['D', 'E'] }");  // Output of MklConv2D does not go to BiasAdd.
  EXPECT_EQ(DoNodeMerge(),
-            "A(Input);B(Input);C(Conv2D);D(Input);E(Input);F(BiasAdd)|"
-             "A->C;B->C:1;D->F;E->F:1");
+            "A(Input);B(Input);C(MklConv2D);D(Input);E(Input);F(BiasAdd);"
+            "M(MklInput);N(MklInput)|A->C;B->C:2;D->F;E->F:1;M->C:1;N->C:3");
 }

-// Conv2D has two outgoing edges: BiasAdd and some other dummy node (Add).
+// MklConv2D has two outgoing edges: BiasAdd and some other dummy node (Add).
 // Merge should not be done in such case.
 TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_Dataflow2) {
  InitGraph(
      "node { name: 'A' op: 'Input'}"
+      "node { name: 'M' op: 'MklInput'}"
      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Conv2D'"
+      "node { name: 'N' op: 'MklInput'}"
+      "node { name: 'C' op: 'MklConv2D'"
      " attr { key: 'T'                value { type: DT_FLOAT } }"
      " attr { key: 'data_format'      value { s: 'NCHW' } }"
      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B']}"
+      " input: ['A', 'M', 'B', 'N']}"
      "node { name: 'D' op: 'Input'}"
      "node { name: 'E' op: 'Input'}"
      "node { name: 'F' op: 'BiasAdd'"
@ -211,8 +248,9 @@ TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_Dataflow2) {
      " attr { key: 'T'                value { type: DT_FLOAT } }"
      " input: ['C', 'E'] }");
  EXPECT_EQ(DoNodeMerge(),
-            "A(Input);B(Input);C(Conv2D);D(Input);E(Input);F(BiasAdd);G(Add)|"
-             "A->C;B->C:1;C->G;D->F;E->F:1;E->G:1");
+            "A(Input);B(Input);C(MklConv2D);D(Input);E(Input);F(BiasAdd);"
+            "G(Add);M(MklInput);N(MklInput)|A->C;B->C:2;C->G;D->F;"
+            "E->F:1;E->G:1;M->C:1;N->C:3");
 }

 // data_format attribute value mismatch. Merge should not be done
@ -220,28 +258,63 @@ TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_Dataflow2) {
 TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_AttrMismatch) {
  InitGraph(
      "node { name: 'A' op: 'Input'}"
+      "node { name: 'M' op: 'MklInput'}"
      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Conv2D'"
+      "node { name: 'N' op: 'MklInput'}"
+      "node { name: 'C' op: 'MklConv2D'"
      " attr { key: 'T'                value { type: DT_FLOAT } }"
      " attr { key: 'data_format'      value { s: 'NCHW' } }"
      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B']}"
+      " input: ['A', 'M', 'B', 'N']}"
      "node { name: 'D' op: 'Input'}"
      "node { name: 'E' op: 'BiasAdd'"
      " attr { key: 'T'                value { type: DT_FLOAT } }"
      " attr { key: 'data_format'      value { s: 'NHCW' } }"
      " input: ['C', 'D'] }");
  EXPECT_EQ(DoNodeMerge(),
-            "A(Input);B(Input);C(Conv2D);D(Input);E(BiasAdd)|"
-            "A->C;B->C:1;C->E;D->E:1");
+            "A(Input);B(Input);C(MklConv2D);D(Input);E(BiasAdd);M(MklInput);"
+            "N(MklInput)|A->C;B->C:2;C->E;D->E:1;M->C:1;N->C:3");
 }

-// Test set 2: Conv2D..BiasAddGrad -> Conv2DWithBiasBackpropBias rewrite tests
+#if 0
+// This test set is disabled temporarily as we do not enable node rewrite.
+// This test set will be enabled when we support Mkl-specific kernels for
+// backward bias.
+//
+// Test set 2: MklConv2D..BiasAddGrad -> Conv2DWithBiasBackpropBias
+// rewrite tests

-// C=Conv2D(A,B); D=Sub(C,A); F=BiasAddGrad(D)
+// C=MklConv2D(A,M,B,N); D=Sub(C,A); E=BiasAddGrad(D)
 TEST_F(OptimizerMergeTest, Conv2DBackprop_Positive) {
+  InitGraph(  // BiasAddGrad 'E' reaches MklConv2D 'C' through Sub 'D'.
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'M' op: 'MklInput'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'N' op: 'MklInput'}"
+      "node { name: 'C' op: 'MklConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'M', 'B', 'N']}"
+      "node { name: 'D' op: 'Sub'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['C', 'A']}"
+      "node { name: 'E' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['D'] }");
+  EXPECT_EQ(DoNodeMerge(),  // Only the op type of 'E' is expected to change.
+            "A(Input);B(Input);C(MklConv2D);D(Sub);E(Conv2DWithBiasBackpropBias);"
+            "M(MklInput);N(MklInput)|A->C;A->D:1;B->C:2;C->D;D->E;M->C:1;N->C:3");
+}
+
+// No MklConv2D in context, but Conv2D in context. No rewrite should happen.
+// C=Conv2D(A,B); D=Sub(C,A); E=BiasAddGrad(D)
+TEST_F(OptimizerMergeTest, Conv2DBackprop_Negative_NoMklConv2D) {
  InitGraph(
      "node { name: 'A' op: 'Input'}"
      "node { name: 'B' op: 'Input'}"
@ -260,12 +333,12 @@ TEST_F(OptimizerMergeTest, Conv2DBackprop_Positive) {
      " attr { key: 'data_format'      value { s: 'NCHW' } }"
      " input: ['D'] }");
  EXPECT_EQ(DoNodeMerge(),
-            "A(Input);B(Input);C(Conv2D);D(Sub);E(Conv2DWithBiasBackpropBias)|"
+            "A(Input);B(Input);C(Conv2D);D(Sub);E(BiasAddGrad)|"
             "A->C;A->D:1;B->C:1;C->D;D->E");
 }

 // No Conv2D in the context for BiasAddGrad. No rewrite should happen.
-// C=Add(A,B); D=Sub(C,A); F=BiasAddGrad(D,E)
+// C=Add(A,B); D=Sub(C,A); E=BiasAddGrad(D)
 TEST_F(OptimizerMergeTest, Conv2DBackprop_Negative_NoConv2D) {
  InitGraph(
      "node { name: 'A' op: 'Input'}"
@ -287,7 +360,7 @@ TEST_F(OptimizerMergeTest, Conv2DBackprop_Negative_NoConv2D) {

 // No Conv2D in the context for BiasAddGrad, but MatMul in context.
 // Rewrite should happen, but name of BiasAddGrad does not change.
-// C=MatMul(A,B); D=Sub(C,A); F=BiasAddGrad(D,E)
+// C=MatMul(A,B); D=Sub(C,A); E=BiasAddGrad(D)
 TEST_F(OptimizerMergeTest, Conv2DBackprop_Negative_NoConv2D_MatMul) {
  InitGraph(
      "node { name: 'A' op: 'Input'}"
@ -310,7 +383,7 @@ TEST_F(OptimizerMergeTest, Conv2DBackprop_Negative_NoConv2D_MatMul) {
 }

 // Test set 3: MatMul..BiasAddGrad -> BiasAddGrad rewrite tests
-// C=MatMul(A,B); D=Sub(C,A); F=BiasAddGrad(D,E)
+// C=MatMul(A,B); D=Sub(C,A); E=BiasAddGrad(D)
 TEST_F(OptimizerMergeTest, MatMulBiasAddGrad_Positive) {
  InitGraph(
      "node { name: 'A' op: 'Input'}"
@ -333,7 +406,7 @@ TEST_F(OptimizerMergeTest, MatMulBiasAddGrad_Positive) {
 }

 // No MatMul in the context for BiasAddGrad. No rewrite should happen.
-// C=Add(A,B); D=Sub(C,A); F=BiasAddGrad(D,E)
+// C=Add(A,B); D=Sub(C,A); E=BiasAddGrad(D)
 TEST_F(OptimizerMergeTest, MatMulBiasAddGrad_Negative_NoMatMul) {
  InitGraph(
      "node { name: 'A' op: 'Input'}"
@ -352,7 +425,7 @@ TEST_F(OptimizerMergeTest, MatMulBiasAddGrad_Negative_NoMatMul) {
            "A(Input);B(Input);C(Add);D(Sub);E(BiasAddGrad)|"
             "A->C;A->D:1;B->C:1;C->D;D->E");
 }
-
+#endif

 static void BM_NodeMerge(int iters, int op_nodes) {
  testing::StopTiming();
--- a/tensorflow/core/graph/mkl_tfconversion_pass.cc
+++ b/tensorflow/core/graph/mkl_tfconversion_pass.cc
@ -0,0 +1,271 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+
+#include <set>
+#include <vector>
+#include <queue>
+#include <utility>
+#include <string>
+#include <memory>
+
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+
+#include "tensorflow/core/graph/mkl_tfconversion_pass.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+namespace tensorflow {
+
+// This pass inserts Mkl to Tf tensor conversion nodes (represented by C)
+// in the graph in between A and B, where A and B match any one
+// of the following
+// cases:
+//  1) A = layer/Op that generates output in Mkl format and,
+//     B = layer/Op that does not accept input in Mkl format and,
+//     A -> B (there is a direct edge between A and B), then
+//     we will insert C such that A->C->B.
+//
+//  2) A = layer/Op that generates output in Mkl format and,
+//     B = NULL (in other words, A is the last layer in the graph), then
+//     we will insert C such that A->C. (C will be the last layer.)
+//
+//  Note that case 1 applies to all outputs of A that are input to B.
+//  In other words, the conversions will be required for every output
+//  of A that is input to B. For example, let us say the output of A
+//  is A1, A2, A3, of which A1 and A2 are in Mkl format, but A3 is not
+//  in Mkl format, and all of them are input to B. In such case, we will
+//  do the conversion for A1 and A2 only. We do not need to do any conversion
+//  for A3.
+//
+// This pass relies on layers registering whether they are Mkl compliant.
+// An Mkl-compliant layer can accept inputs in Mkl format and produce outputs
+// in Mkl format. A non-compliant layer accepts inputs and produces outputs in
+// TensorFlow format.
+//
+// Graph optimization pass that inserts MklToTf conversion nodes on data edges
+// leading from an Mkl-supported op to an op that is not Mkl-supported.
+class MklToTfConversionPass : public GraphOptimizationPass {
+ public:
+  MklToTfConversionPass() {}
+
+  // Entry point called by the optimization pass registry. Runs RunPass() on
+  // the graph(s) found in 'options'.
+  //
+  // @return Status::OK() in all cases visible in this file.
+  Status Run(const GraphOptimizationPassOptions& options) override;
+
+  // Insert layout conversion node in the graph pointed by g.
+  // Function scans the graph for candidate edges where we
+  // need to insert conversion nodes.
+  //
+  // @return true if at least one conversion node is inserted;
+  // false, otherwise.
+  bool RunPass(std::unique_ptr<Graph>* g);
+
+
+ private:
+  // Is the input Op supported by Mkl-specific layout?
+  //
+  // @input op_name string of the op
+  // @return true if op is Mkl supported; false, otherwise.
+  inline bool IsMklSupportedOp(const string& op_name) const {
+    return mkl_layer_registry::IsMklLayer(op_name);
+  }
+
+  // Insert layout conversion node on the edge pointed by 'e' from graph 'g'.
+  //
+  // Edge will be deleted once a call to this function is successful.
+  // Any attempt to use the edge after this call
+  // will lead to undefined behaviors.
+  //
+  // @input g graph that owns the edge and receives the conversion node
+  // @input e edge on which the conversion node is inserted
+  // @return Success:OK() if insertion is successful, otherwise returns
+  //         appropriate error status code.
+  Status InsertConversionNodeOnEdge(std::unique_ptr<Graph>* g, Edge* e);
+};
+
+// We register MklToTf insertion for phase 1 in post-partition grouping.
+// We register this pass after partitioning so that we get a complete
+// picture of inputs and outputs of the nodes in the graphs.
+const OptimizationPassRegistry::Grouping kMklTfConvPassGroup =
+  OptimizationPassRegistry::POST_PARTITIONING;
+REGISTER_OPTIMIZATION(kMklTfConvPassGroup, 1, MklToTfConversionPass);
+
+// Rewrites the edge src->dst into src->MklToTf->dst.
+//
+// The conversion node consumes two outputs of 'src': the output the edge
+// originates from and the output immediately following it. It is placed on
+// the same device as 'src' and feeds the destination input slot that the
+// original edge used. On success the original edge is removed from the graph
+// and must not be used any more.
+//
+// @input g graph that owns 'e'
+// @input e edge to rewrite; must be non-null with non-null endpoints
+// @return Status::OK() on success; INVALID_ARGUMENT if both endpoints carry a
+//         "T" attribute and the two values differ.
+Status MklToTfConversionPass::InsertConversionNodeOnEdge(
+    std::unique_ptr<Graph>* g, Edge *e) {
+  CHECK_NOTNULL(e);
+
+  Node* src = e->src();
+  Node* dst = e->dst();
+
+  CHECK_NOTNULL(src);
+  CHECK_NOTNULL(dst);
+
+  Node* conversion_node = nullptr;
+  DataType src_datatype = DT_INVALID;
+  DataType dst_datatype = DT_INVALID;
+  string data_format;
+
+  // RunPass() only schedules edges whose source has a "T" attribute, so a
+  // failure here is a programming error.
+  TF_CHECK_OK(GetNodeAttr(src->def(), "T", &src_datatype));
+  // The destination is not an Mkl op and is not guaranteed to have a "T"
+  // attribute. A missing attribute must not abort the process; the datatypes
+  // are compared only when both are known.
+  const bool dst_datatype_found =
+      GetNodeAttr(dst->def(), "T", &dst_datatype).ok();
+  if (dst_datatype_found && src_datatype != dst_datatype) {
+    string err_msg = "T attribute of " + src->name() + " and " +
+                      dst->name() + " do not match. Will not insert" +
+                     " MklToTf node in such case.";
+    return Status(error::Code::INVALID_ARGUMENT, err_msg.c_str());
+  }
+
+  // Lets build the conversion node and specify src as input.
+  TF_CHECK_OK(NodeBuilder((*g)->NewName("Mkl2Tf"), "MklToTf")
+        .Input(src, e->src_output())
+        .Input(src, e->src_output()+1)  // Mkl tensor immediately
+                                        // follows Tf tensor.
+        .Device(src->def().device())  // We want to get conversion node
+                                      // on same device as source node.
+        .Attr("T", src_datatype)
+        .Finalize(&**g, &conversion_node));
+
+  CHECK_NOTNULL(conversion_node);
+  // Propagate the data format of the source op when it has one.
+  if (GetNodeAttr(src->def(), "data_format", &data_format).ok()) {
+    conversion_node->AddAttr("data_format", data_format);
+  }
+
+  // Get assigned device from source node and apply it to conversion node.
+  // We want conversion node to be on the same device as the source node.
+  conversion_node->set_assigned_device_name(src->assigned_device_name());
+
+  // Set the Mkl layer label for this op.
+  conversion_node->AddAttr("_kernel", mkl_layer_registry::kMklLayerLabel);
+
+  // Now that we have added edge from src->conversion_node, let's add edge from
+  // output of conversion_node to the dest node. Since conversion_node
+  // has only 1 output, the src_output of conversion_node is 0.
+  CHECK_NOTNULL((*g)->AddEdge(conversion_node, 0, dst, e->dst_input()));
+
+  VLOG(1) << "MklToTfConversionPass: Inserting Conversion node on: "
+          << src->type_string() << " and " << dst->type_string()
+          << " successful.";
+
+  // Remove src->dst edge now.
+  (*g)->RemoveEdge(e);
+  return Status::OK();
+}
+
+// Scans all data edges of '*g' and inserts an MklToTf conversion node on every
+// edge whose source op is Mkl-supported and whose destination op is not.
+//
+// @input g graph to rewrite in place; must be non-null
+// @return true if at least one conversion node was inserted; false otherwise.
+bool MklToTfConversionPass::RunPass(std::unique_ptr<Graph>* g) {
+  bool result = false;
+
+  CHECK_NOTNULL(g);
+
+  DumpGraph("Before MklToTfConversionPass", &**g);
+
+  // Since we are looking for an Mkl-supported op node immediately
+  // followed by a non-Mkl op node, we just iterate over the edge
+  // set of the graph. Candidate edges are collected first and rewritten in a
+  // second loop, so edges are not added or removed while iterating over them.
+  std::vector<Edge*> candidate_edges;
+
+  for (const Edge *e : (*g)->edges()) {
+    Node* src = e->src();
+    Node* dst = e->dst();
+
+    // We skip control edges.
+    if (e->IsControlEdge()) {
+      continue;
+    }
+
+    VLOG(1) << "MklToTfConversionPass: InsertConversionNodes: "
+            << src->type_string() << " and " << dst->type_string();
+
+    // The source must carry a "T" attribute; it is required later to build
+    // the conversion node. Edges whose source lacks it are skipped.
+    DataType src_datatype = DT_INVALID;
+    if (!GetNodeAttr(src->def(), "T", &src_datatype).ok()) {
+      continue;
+    }
+    // The datatype of the destination is intentionally not checked here,
+    // because the destination node may not be an Mkl node.
+
+    // Check if src is Mkl-compliant, while dst is not Mkl-compliant.
+    if (IsMklSupportedOp(src->type_string()) &&
+       !IsMklSupportedOp(dst->type_string())) {
+      VLOG(1) << "MklToTfConversionPass: Scheduled nodes " << src->name()
+              << " and " << dst->name() << " for inserting conversion nodes";
+      candidate_edges.push_back(const_cast<Edge*>(e));
+    }
+  }
+
+  // Process all candidate edges and insert conversion nodes on them.
+  for (Edge* e : candidate_edges) {
+    // Names are captured up front because the edge is removed by a
+    // successful insertion and must not be touched afterwards.
+    string src_name = e->src()->name();
+    string dst_name = e->dst()->name();
+    // Even if we insert conversion node on a single edge, we
+    // need to return true.
+    if (InsertConversionNodeOnEdge(g, e).ok()) {
+      VLOG(1) << "MklToTfConversionPass: Inserted conversion "
+              << "node on edge between " << src_name << " and " << dst_name;
+      result = true;
+    }
+  }
+
+  DumpGraph("After MklToTfConversionPass", &**g);
+
+  // We need to return true even if we insert one conversion node
+  // anywhere in the graph.
+  return result;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//              Run function for the pass
+//////////////////////////////////////////////////////////////////////////////
+
+// Free-function wrapper around MklToTfConversionPass::RunPass(): runs the pass
+// on '*g' using a temporary pass object and returns RunPass()'s result, i.e.
+// true if at least one conversion node was inserted.
+bool InsertMklToTfConversionNodes(std::unique_ptr<Graph>* g) {
+  return MklToTfConversionPass().RunPass(g);
+}
+
+// Runs the conversion pass on the graph(s) supplied by the pass registry.
+//
+// Which member of 'options' is processed depends on the grouping this pass is
+// registered for (kMklTfConvPassGroup): the partition graphs for
+// POST_PARTITIONING, options.graph otherwise. A member that is null is
+// skipped instead of being dereferenced.
+//
+// @return Status::OK() always; the result of RunPass() is not reported.
+Status MklToTfConversionPass::Run(
+  const GraphOptimizationPassOptions& options) {
+  if (options.graph == nullptr && options.partition_graphs == nullptr) {
+    return Status::OK();
+  }
+
+  // RunPass() rewrites the graph in place through the pointer and does not
+  // take ownership, so the caller's unique_ptr is passed straight through.
+  auto process_graph = [&](std::unique_ptr<Graph>* g) {
+    RunPass(g);
+  };
+
+  if (kMklTfConvPassGroup != OptimizationPassRegistry::POST_PARTITIONING) {
+    // For any pre-partitioning phase, graph is stored in options.graph.
+    if (options.graph != nullptr) {
+      process_graph(options.graph);
+    }
+  } else {
+    // For post partitioning phase, graphs are stored in
+    // options.partition_graphs.
+    if (options.partition_graphs != nullptr) {
+      for (auto& pg : *options.partition_graphs) {
+        process_graph(&pg.second);
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+}  // namespace tensorflow
+
+#endif
--- a/tensorflow/core/graph/mkl_tfconversion_pass.h
+++ b/tensorflow/core/graph/mkl_tfconversion_pass.h
@ -0,0 +1,36 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// An optimization pass that inserts MklToTf conversion nodes in the graph
+
+#ifndef TENSORFLOW_CORE_GRAPH_MKL_TFCONVERSION_PASS_H_
+#define TENSORFLOW_CORE_GRAPH_MKL_TFCONVERSION_PASS_H_
+
+#ifdef INTEL_MKL
+
+#include <sys/types.h>
+#include <memory>
+#include "tensorflow/core/graph/graph.h"
+
+namespace tensorflow {
+// Interface to invoke the MklToTf conversion pass directly, for unit tests.
+//
+// The graph pointed to by 'g' is rewritten in place.
+//
+// Returns true if and only if 'g' is mutated.
+extern bool InsertMklToTfConversionNodes(std::unique_ptr<Graph>* g);
+}  // namespace tensorflow
+
+#endif
+
+#endif  // TENSORFLOW_CORE_GRAPH_MKL_TFCONVERSION_PASS_H_
--- a/tensorflow/core/graph/mkl_tfconversion_pass_test.cc
+++ b/tensorflow/core/graph/mkl_tfconversion_pass_test.cc
@ -0,0 +1,243 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+
+#include "tensorflow/core/graph/mkl_tfconversion_pass.h"
+
+#include <vector>
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+namespace {
+
+// Test fixture for the MklToTf conversion pass. Holds the graph under test
+// and provides helpers to build it from a text-format GraphDef, run the pass
+// and render the graph as a canonical string for comparison.
+class MklToTfConversionPass : public ::testing::Test {
+ public:
+  MklToTfConversionPass() : graph_(OpRegistry::Global()) {}
+
+  // Parses the text-format GraphDef 's' and converts it into 'graph'.
+  // CHECK-fails if parsing or conversion fails.
+  static void InitGraph(const string& s, Graph* graph) {
+    GraphDef graph_def;
+
+    auto parser = protobuf::TextFormat::Parser();
+    CHECK(parser.MergeFromString(s, &graph_def)) << s;
+    GraphConstructorOptions opts;
+    TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph_def, graph));
+  }
+
+  // Initializes the fixture's graph from 's' and records its canonical
+  // string as the original graph.
+  void InitGraph(const string& s) {
+    InitGraph(s, &graph_);
+    original_ = CanonicalGraphString(&graph_);
+  }
+
+  // Only op nodes are part of the canonical string.
+  static bool IncludeNode(const Node* n) { return n->IsOp(); }
+
+  // Renders one endpoint of an edge: "name" for slot 0, "name:control" for
+  // the control slot and "name:<index>" for every other slot.
+  static string EdgeId(const Node* n, int index) {
+    if (index == 0) {
+      return n->name();
+    } else if (index == Graph::kControlSlot) {
+      return strings::StrCat(n->name(), ":control");
+    } else {
+      return strings::StrCat(n->name(), ":", index);
+    }
+  }
+
+  // Returns "<nodes>|<edges>" where nodes are "name(op)" and edges are
+  // "src->dst", each list sorted and joined with ';'.
+  string CanonicalGraphString(Graph* g) {
+    std::vector<string> nodes;
+    std::vector<string> edges;
+    for (const Node* n : g->nodes()) {
+      if (IncludeNode(n)) {
+        nodes.push_back(strings::StrCat(n->name(), "(", n->type_string(), ")"));
+      }
+    }
+    for (const Edge* e : g->edges()) {
+      if (IncludeNode(e->src()) && IncludeNode(e->dst())) {
+        edges.push_back(strings::StrCat(EdgeId(e->src(), e->src_output()), "->",
+                                        EdgeId(e->dst(), e->dst_input())));
+      }
+    }
+    // Canonicalize
+    std::sort(nodes.begin(), nodes.end());
+    std::sort(edges.begin(), edges.end());
+    return strings::StrCat(str_util::Join(nodes, ";"), "|",
+                           str_util::Join(edges, ";"));
+  }
+
+  // Runs the conversion pass on the fixture's graph and returns the
+  // canonical string of the resulting graph.
+  string DoRunMklToTfConversionPass() {
+    string before = CanonicalGraphString(&graph_);
+    LOG(ERROR) << "Before MklToTf conversion pass: " << before;
+
+    // The pass takes a std::unique_ptr<Graph>*, but graph_ is a member of
+    // this fixture and must never be deleted through a smart pointer. Wrap it
+    // only for the duration of the call and release it again afterwards, so
+    // nothing is leaked and nothing is double-freed.
+    std::unique_ptr<Graph> ug(&graph_);
+    InsertMklToTfConversionNodes(&ug);
+    static_cast<void>(ug.release());
+
+    string result = CanonicalGraphString(&graph_);
+    LOG(ERROR) << "After MklToTf conversion pass:  " << result;
+    return result;
+  }
+
+  const string& OriginalGraph() const { return original_; }
+
+  Graph graph_;      // Graph under test.
+  string original_;  // Canonical string of graph_ right after InitGraph().
+};
+
+REGISTER_OP("Input").Output("o: float").SetIsStateful();
+REGISTER_OP("HalfInput").Output("o: half").SetIsStateful();
+REGISTER_OP("MklInput").Output("o: uint8").SetIsStateful();
+
+// C=Mul(A,B); D=Mul(A,B)
+// The expected string contains no MklToTf node: the pass is expected to
+// leave this graph unchanged.
+TEST_F(MklToTfConversionPass, Basic) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoRunMklToTfConversionPass(),
+            "A(Input);B(Input);C(Mul);D(Mul)|"
+            "A->C;A->D;B->C:1;B->D:1");
+}
+
+// MklConv2D followed by Non-Mkl layer
+// C=MklConv2D(A,M,B,N); E=Sub(C,D)
+// MklToTf node should be inserted on the edge C->E: it reads outputs 0 and 1
+// of C and feeds input 0 of E.
+TEST_F(MklToTfConversionPass, Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'M' op: 'MklInput'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'N' op: 'MklInput'}"
+      "node { name: 'C' op: 'MklConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'M', 'B', 'N']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Sub'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['C', 'D']}");
+  EXPECT_EQ(DoRunMklToTfConversionPass(),
+            "A(Input);B(Input);C(MklConv2D);D(Input);E(Sub);M(MklInput);"
+            "Mkl2Tf/_0(MklToTf);N(MklInput)|A->C;B->C:2;C->Mkl2Tf/_0;"
+            "C:1->Mkl2Tf/_0:1;D->E:1;M->C:1;Mkl2Tf/_0->E;N->C:3");
+}
+
+// MklConv2D followed by Non-Mkl layer, and MklConv2D uses half type
+// C=MklConv2D(A,M,B,N); E=Sub(C,D)
+// MklToTf node should be inserted. The expected graph has the same shape as
+// in the Positive test; only the datatype (DT_HALF) and the input op
+// (HalfInput) differ.
+TEST_F(MklToTfConversionPass, Positive_Type) {
+  InitGraph(
+      "node { name: 'A' op: 'HalfInput'}"
+      "node { name: 'M' op: 'MklInput'}"
+      "node { name: 'B' op: 'HalfInput'}"
+      "node { name: 'N' op: 'MklInput'}"
+      "node { name: 'C' op: 'MklConv2D'"
+      " attr { key: 'T'                value { type: DT_HALF } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'M', 'B', 'N']}"
+      "node { name: 'D' op: 'HalfInput'}"
+      "node { name: 'E' op: 'Sub'"
+      " attr {key: 'T'                 value { type: DT_HALF } }"
+      " input: ['C', 'D']}");
+  EXPECT_EQ(DoRunMklToTfConversionPass(),
+            "A(HalfInput);B(HalfInput);C(MklConv2D);D(HalfInput);"
+            "E(Sub);M(MklInput);Mkl2Tf/_0(MklToTf);N(MklInput)|"
+            "A->C;B->C:2;C->Mkl2Tf/_0;C:1->Mkl2Tf/_0:1;D->E:1;"
+            "M->C:1;Mkl2Tf/_0->E;N->C:3");
+}
+
+// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Sub(E,Y);
+// There is no Mkl layer so no conversion op should be inserted: the expected
+// string lists exactly the nodes and edges of the input graph.
+TEST_F(MklToTfConversionPass, Negative_NoMklLayer) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'BiasAdd'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Sub'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  EXPECT_EQ(DoRunMklToTfConversionPass(),
+            "A(Input);B(Input);C(Conv2D);D(Input);E(BiasAdd);Y(Input);Z(Sub)|"
+             "A->C;B->C:1;C->E;D->E:1;E->Z;Y->Z:1");
+}
+
+// Benchmark for the conversion pass on a graph of 10 'Input' nodes and
+// 'op_nodes' 'Mul' nodes, each reading two randomly chosen inputs. Only the
+// pass itself is timed; graph construction and destruction are not.
+static void BM_RunMklToTfConversionPass(int iters, int op_nodes) {
+  testing::StopTiming();
+
+  // Text-format GraphDef, built once and re-parsed for every iteration.
+  string graph_text;
+  for (int input_idx = 0; input_idx < 10; ++input_idx) {
+    graph_text +=
+        strings::Printf("node { name: 'in%04d' op: 'Input'}", input_idx);
+  }
+  random::PhiloxRandom philox(301, 17);
+  random::SimplePhilox rnd(&philox);
+  for (int op_idx = 0; op_idx < op_nodes; ++op_idx) {
+    graph_text += strings::Printf(
+        "node { name: 'op%04d' op: 'Mul' attr { key: 'T' value { "
+        "type: DT_FLOAT } } input: ['in%04d', 'in%04d' ] }",
+        op_idx, rnd.Uniform(10), rnd.Uniform(10));
+  }
+
+  bool label_pending = true;
+  while (iters > 0) {
+    // The graph is owned by a unique_ptr from the start; it is destroyed at
+    // the end of each loop iteration, outside the timed region.
+    std::unique_ptr<Graph> ug(new Graph(OpRegistry::Global()));
+    MklToTfConversionPass::InitGraph(graph_text, ug.get());
+    const int node_count = ug->num_node_ids();
+    if (label_pending) {
+      testing::SetLabel(
+          strings::StrCat("Per graph node.  Nodes: ", node_count));
+      label_pending = false;
+    }
+
+    testing::StartTiming();
+    InsertMklToTfConversionNodes(&ug);
+    testing::StopTiming();
+
+    // Our benchmark units are individual graph nodes, not whole graphs.
+    iters -= node_count;
+  }
+}
+BENCHMARK(BM_RunMklToTfConversionPass)->Arg(1000)->Arg(10000);
+
+}  // namespace
+}  // namespace tensorflow
+
+#endif /* INTEL_MKL */
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@ -688,8 +688,15 @@ tf_kernel_library(

 tf_kernel_library(
    name = "transpose_op",
-    prefix = "transpose_op",
-    deps = ARRAY_DEPS,
+    srcs = [
+        "transpose_op.cc",
+    ] + if_mkl([
+        "mkl_transpose_op.cc",
+    ]),
+    hdrs = ["transpose_op.h"],
+    deps = ARRAY_DEPS + if_mkl([
+        "//third_party/mkl:intel_binary_blob",
+    ]),
 )

 tf_kernel_library(
@ -1735,6 +1742,22 @@ tf_cuda_cc_test(
    ],
 )

+tf_cuda_cc_test(
+    name = "resize_benchmark_test",
+    srcs = ["resize_op_benchmark_test.cc"],
+    deps = [
+        ":image",
+        ":ops_testutil",
+        ":ops_util",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 cc_library(
    name = "io",
    deps = [
@ -4376,7 +4399,7 @@ tf_cc_test(

 if_mkl(
    tf_kernel_library(
-        name = "mkl_ops",
+        name = "mkl_matmul_op",
        prefix = "mkl_matmul",
        deps = [
            ":math",
@ -4385,6 +4408,40 @@ if_mkl(
    ),
 )

+if_mkl(
+    tf_kernel_library(
+        name = "mkl_conv_op",
+        prefix = "mkl_conv",
+        deps = [
+            ":bounds_check",
+            ":ops_util",
+            "//tensorflow/core:core_cpu",
+            "//tensorflow/core:framework",
+            "//tensorflow/core:lib",
+            "//tensorflow/core:lib_internal",
+            "//tensorflow/core:nn_ops_op_lib",
+            "//third_party/mkl:intel_binary_blob",
+        ],
+    ),
+)
+
+if_mkl(
+    tf_kernel_library(
+        name = "mkl_tfconv_op",
+        prefix = "mkl_tfconv",
+        deps = [
+            ":bounds_check",
+            ":ops_util",
+            "//tensorflow/core:core_cpu",
+            "//tensorflow/core:framework",
+            "//tensorflow/core:lib",
+            "//tensorflow/core:lib_internal",
+            "//tensorflow/core:nn_ops_op_lib",
+            "//third_party/mkl:intel_binary_blob",
+        ],
+    ),
+)
+
 # -----------------------------------------------------------------------------
 # Google-internal targets.  These must be at the end for syncrepo.

--- a/tensorflow/core/kernels/adjust_hue_op.cc
+++ b/tensorflow/core/kernels/adjust_hue_op.cc
@ -1,5 +1,4 @@
 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
@ -12,16 +11,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#define EIGEN_USE_THREADS
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif
+
 #include <memory>
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/adjust_hue_op.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/work_sharder.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

 namespace tensorflow {

@ -77,6 +84,7 @@ template <class Device>
 class AdjustHueOp;

 namespace internal {
+
 // Helper function to convert a RGB color to H-and-V-range. H is in the range
 // of [0, 6] instead of the normal [0, 1]
 static void rgb_to_hv_range(float r, float g, float b, float* h, float* v_min,
@ -185,6 +193,7 @@ static void hv_range_to_rgb(float h, float v_min, float v_max, float* r,
 }
 }  // namespace internal

+
 template <>
 class AdjustHueOp<CPUDevice> : public AdjustHueOpBase {
 public:
@ -237,4 +246,34 @@ class AdjustHueOp<CPUDevice> : public AdjustHueOpBase {
 REGISTER_KERNEL_BUILDER(Name("AdjustHue").Device(DEVICE_CPU),
                        AdjustHueOp<CPUDevice>);

+#if GOOGLE_CUDA
+// GPU specialization of AdjustHueOp. Passes the flattened float buffers of
+// the input, delta and output tensors to functor::AdjustHueGPU.
+template <>
+class AdjustHueOp<GPUDevice> : public AdjustHueOpBase {
+ public:
+  explicit AdjustHueOp(OpKernelConstruction* context)
+      : AdjustHueOpBase(context) {}
+
+  // Runs the hue adjustment for the tensors in 'options'. Fails the op with
+  // an Internal error when the GPU device has no stream; does nothing more
+  // when the input has no elements.
+  void DoCompute(OpKernelContext* context,
+                 const ComputeOptions& options) override {
+    const int64 element_count = options.input->NumElements();
+    GPUDevice gpu = context->eigen_gpu_device();
+    // The stream check comes first, so it is performed for empty input too.
+    const auto gpu_stream = gpu.stream();
+    OP_REQUIRES(context, gpu_stream,
+                errors::Internal("No GPU stream available."));
+    if (element_count <= 0) {
+      return;
+    }
+    functor::AdjustHueGPU()(&gpu, element_count,
+                            options.input->flat<float>().data(),
+                            options.delta->flat<float>().data(),
+                            options.output->flat<float>().data());
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("AdjustHue").Device(DEVICE_GPU), AdjustHueOp<GPUDevice>);
+
+#endif
+
+//} // namespace functor
 }  // namespace tensorflow
--- a/tensorflow/core/kernels/adjust_hue_op.h
+++ b/tensorflow/core/kernels/adjust_hue_op.h
@ -0,0 +1,42 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef _TENSORFLOW_CORE_KERNELS_ADJUST_HUE_OP_H
+#define _TENSORFLOW_CORE_KERNELS_ADJUST_HUE_OP_H
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
+
+// Functor that performs the hue adjustment on the GPU. Only declared here;
+// the definition lives in the CUDA translation unit.
+struct AdjustHueGPU {
+  // device:             GPU device whose stream the work is issued on.
+  // number_of_elements: total number of floats in 'input' and 'output'.
+  // input:              flattened input image data.
+  // delta:              pointer to the hue offset.
+  // output:             buffer receiving the adjusted image data.
+  // NOTE(review): all three buffers are presumably device memory - confirm
+  // against the caller.
+  void operator()(
+      GPUDevice* device,
+      const int64 number_of_elements,
+      const float* const input,
+      const float* const delta,
+      float* const output
+  );
+};
+
+} // namespace functor
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
+#endif // _TENSORFLOW_CORE_KERNELS_ADJUST_HUE_OP_H
--- a/tensorflow/core/kernels/adjust_hue_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/adjust_hue_op_gpu.cu.cc
@ -0,0 +1,141 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/adjust_hue_op.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
+namespace tensorflow {
+namespace internal {
+
+namespace {
+  // One pixel in RGB representation.
+  struct RgbTuple {
+    float r;
+    float g;
+    float b;
+  };
+
+  // One pixel in HSV representation.
+  struct HsvTuple {
+    float h;
+    float s;
+    float v;
+  };
+}  // anon namespace
+
+// Converts one RGB pixel to HSV. The hue is returned as a fraction of a full
+// turn (the sector value divided by 6); hue and saturation are 0 whenever the
+// respective divisor (chroma, maximum channel) is not positive.
+__device__ HsvTuple rgb2hsv_cuda(const float r, const float g, const float b)
+{
+  const float max_channel = fmaxf(r, fmaxf(g, b));
+  const float min_channel = fminf(r, fminf(g, b));
+  const float chroma = max_channel - min_channel;
+
+  HsvTuple hsv;
+  hsv.h = 0.0f;
+  hsv.s = 0.0f;
+  hsv.v = max_channel;
+
+  // hue: which formula applies depends on the channel holding the maximum.
+  if (chroma > 0.0f) {
+    if (max_channel == r) {
+      const float ratio = (g - b) / chroma;
+      const float sign = copysignf(1.0f, ratio);
+      // Negative ratios are shifted up by 6 so the result is not negative.
+      hsv.h = ((sign < 0.0f) * 6.0f + sign * fmodf(sign * ratio, 6.0f)) / 6.0f;
+    } else if (max_channel == g) {
+      hsv.h = ((b - r) / chroma + 2.0f) / 6.0f;
+    } else {
+      hsv.h = ((r - g) / chroma + 4.0f) / 6.0f;
+    }
+  }
+
+  // saturation
+  if (max_channel > 0.0) {
+    hsv.s = chroma / max_channel;
+  }
+  return hsv;
+}
+
+// Converts one HSV pixel to RGB. 'h' is scaled by 6 to select one of six hue
+// sectors; if the scaled hue falls outside [0, 6) no sector matches and all
+// three channels are set to v - v * s.
+__device__ RgbTuple hsv2rgb_cuda(const float h, const float s, const float v)
+{
+  RgbTuple tuple;
+  const float new_h = h * 6.0f;
+  const float chroma = v * s;
+  // Second largest channel component within the current sector.
+  const float x = chroma * (1.0f - fabsf(fmodf(new_h, 2.0f) - 1.0f));
+  // Offset added to every channel.
+  const float new_m = v - chroma;
+  // Exactly one of these flags is true when new_h lies in [0, 6).
+  const bool between_0_and_1 = new_h >= 0.0f && new_h < 1.0f;
+  const bool between_1_and_2 = new_h >= 1.0f && new_h < 2.0f;
+  const bool between_2_and_3 = new_h >= 2.0f && new_h < 3.0f;
+  const bool between_3_and_4 = new_h >= 3.0f && new_h < 4.0f;
+  const bool between_4_and_5 = new_h >= 4.0f && new_h < 5.0f;
+  const bool between_5_and_6 = new_h >= 5.0f && new_h < 6.0f;
+  // The flags act as 0/1 multipliers, which selects the per-sector
+  // contribution without branching.
+  tuple.r = chroma * (between_0_and_1 || between_5_and_6) +
+      x * (between_1_and_2 || between_4_and_5) + new_m;
+  tuple.g = chroma * (between_1_and_2 || between_2_and_3) +
+      x * (between_0_and_1 || between_3_and_4) + new_m;
+  tuple.b = chroma * (between_3_and_4 || between_4_and_5) +
+      x * (between_2_and_3 || between_5_and_6) + new_m;
+  return tuple;
+}
+
+// Kernel that shifts the hue of an image stored as consecutive (r, g, b)
+// float triples. Each thread converts one pixel to HSV, adds the hue delta
+// (wrapped back into [0, 1)) and writes the pixel back as RGB.
+//
+// number_elements: total number of floats in 'input' and 'output'.
+// input:           source pixels.
+// output:          destination pixels, same layout as 'input'.
+// hue_delta:       pointer to a single float; only element 0 is read.
+__global__ void adjust_hue_nhwc(const int64 number_elements,
+                                const float * const __restrict__ input,
+                                float * const output,
+                                const float * const hue_delta)
+{
+  // Multiply by 3 since each pixel consists of three contiguous floats
+  // (NHWC). The thread index is widened to 64 bit before any arithmetic so
+  // that neither the block offset nor the multiplication by 3 can wrap
+  // around in 32-bit unsigned arithmetic for very large inputs.
+  const int64 idx =
+      (static_cast<int64>(blockDim.x) * blockIdx.x + threadIdx.x) * 3;
+  // bounds check: all three channels of the pixel must lie inside the
+  // buffer, not only the first one.
+  if (idx + 2 >= number_elements) {
+    return;
+  }
+  const float delta = hue_delta[0];
+  const HsvTuple hsv = rgb2hsv_cuda(input[idx], input[idx + 1], input[idx + 2]);
+  // hue adjustment
+  float new_h = fmodf(hsv.h + delta, 1.0f);
+  if (new_h < 0.0f) {
+    // fmodf keeps the sign of its first argument; shift negative results
+    // into the positive range.
+    new_h = fmodf(1.0f + new_h, 1.0f);
+  }
+  const RgbTuple rgb = hsv2rgb_cuda(new_h, hsv.s, hsv.v);
+  output[idx] = rgb.r;
+  output[idx + 1] = rgb.g;
+  output[idx + 2] = rgb.b;
+}
+} // namespace internal
+
+
+namespace functor {
+
+// Launches adjust_hue_nhwc on the stream of 'device'.
+//
+// device:             GPU device providing the stream and launch limits.
+// number_of_elements: total number of floats in 'input' and 'output'.
+// input:              source pixels as consecutive (r, g, b) float triples.
+// delta:              pointer to the hue offset read by the kernel.
+// output:             destination buffer, same layout as 'input'.
+//
+// Nothing is launched when the input does not contain a complete pixel.
+void AdjustHueGPU::operator()(
+  GPUDevice* device,
+  const int64 number_of_elements,
+  const float* const input,
+  const float* const delta,
+  float* const output
+) {
+  // Every kernel thread handles one pixel, i.e. three floats, so the grid
+  // is sized by the pixel count rather than by the element count.
+  const int64 pixel_count = number_of_elements / 3;
+  if (pixel_count <= 0) {
+    // A launch with zero blocks would be an invalid configuration.
+    return;
+  }
+  const auto stream = device->stream();
+  const CudaLaunchConfig config = GetCudaLaunchConfig(number_of_elements, *device);
+  const int threads_per_block = config.thread_per_block;
+  // The kernel has no loop over pixels, so the grid must cover all of them;
+  // config.block_count is therefore not used.
+  const int block_count = (pixel_count + threads_per_block - 1) / threads_per_block;
+  internal::adjust_hue_nhwc<<<block_count, threads_per_block, 0, stream>>>(
+    number_of_elements, input, output, delta
+  );
+}
+} // namespace functor
+}  // namespace tensorflow
+#endif // GOOGLE_CUDA
--- a/tensorflow/core/kernels/eigen_pooling.h
+++ b/tensorflow/core/kernels/eigen_pooling.h
@ -338,6 +338,7 @@ struct AvgPoolMeanReducer {
 // In the case below, 0xd8 implies (false_mask) ? (b) : (a)
 // For details, refer to the vpternlogd instruction table at
 // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-2c-manual.pdf
+
 #define psel(a, b, false_mask)                        \
  _mm512_castsi512_ps(_mm512_ternarylogic_epi32(      \
      _mm512_castps_si512(a), _mm512_castps_si512(b), \
--- a/tensorflow/core/kernels/fixed_length_record_reader_op.cc
+++ b/tensorflow/core/kernels/fixed_length_record_reader_op.cc
@ -40,8 +40,8 @@ class FixedLengthRecordReader : public ReaderBase {

  // On success:
  // * input_buffer_ != nullptr,
-  // * input_buffer_->Tell() == footer_bytes_
-  // * file_pos_limit_ == file size - header_bytes_
+  // * input_buffer_->Tell() == header_bytes_
+  // * file_pos_limit_ == file size - footer_bytes_
  Status OnWorkStartedLocked() override {
    record_number_ = 0;
    uint64 file_size = 0;
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@ -0,0 +1,457 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/nn_ops.cc.
+#ifdef INTEL_MKL
+
+#include <string.h>
+#include <map>
+#include <vector>
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_slice.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+#include "tensorflow/core/util/mkl_util.h"
+#include "third_party/mkl/include/mkl_dnn.h"
+#include "third_party/mkl/include/mkl_dnn_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T, bool biasEnabled>
+class MklConv2DOp : public OpKernel {
+ public:
+  ~MklConv2DOp() {}
+
+  // Reads and validates the op attributes: "strides" (must have 4 entries,
+  // with a stride of 1 in the batch and depth dimensions), "data_format" and
+  // "padding". Any failure is reported through `context`.
+  explicit MklConv2DOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+    string data_format;
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
+    OP_REQUIRES(context, strides_.size() == 4,
+                errors::InvalidArgument("Sliding window strides field must "
+                                        "specify 4 dimensions"));
+
+    // data_format_ must already be parsed at this point: it decides which
+    // entries of strides_ are the batch ('N') and depth ('C') strides.
+    const int64 stride_n = GetTensorDim(strides_, data_format_, 'N');
+    const int64 stride_c = GetTensorDim(strides_, data_format_, 'C');
+    OP_REQUIRES(
+        context, stride_n == 1 && stride_c == 1,
+        errors::InvalidArgument("Current implementation does not yet support "
+                                "strides in the batch and depth dimensions."));
+    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+  }
+
+  // Runs the forward convolution of input 0 with the filter in input 1 (and,
+  // when biasEnabled, the bias in input 2) through the MKL DNN primitives.
+  //
+  // Steps: validate the inputs, derive the output geometry, create the MKL
+  // convolution primitive, allocate the output in the primitive's destination
+  // layout, convert inputs to the layouts the primitive asks for, execute,
+  // and release the per-call MKL objects.
+  //
+  // NOTE(review): per-call state is kept in data members (mkl_params_,
+  // mkl_conv_res_, the layouts and the primitive) - confirm Compute is never
+  // run concurrently on the same kernel instance.
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input = MklGetInput(context, 0);
+    GetMklShape(context, 0, &(mkl_params_.input_shape));
+    bool input_in_mkl_format = mkl_params_.input_shape.IsMklTensor();
+
+    const Tensor& filter = MklGetInput(context, 1);
+    MklShape mkl_filter_shape;
+    GetMklShape(context, 1, &mkl_filter_shape);
+    CHECK(!mkl_filter_shape.IsMklTensor())
+        << "Conv filter should not be in MKL Layout";
+
+    if (biasEnabled) {
+      const Tensor& bias = MklGetInput(context, 2);
+      OP_REQUIRES(context, bias.dims() == 1,
+                  errors::InvalidArgument("bias must be 1-dimensional: ",
+                                          bias.shape().DebugString()));
+    }
+
+    // The rank of an MKL-format input is taken from its MklShape instead, so
+    // the TF tensor rank is only checked for plain TF inputs.
+    if (!input_in_mkl_format) {
+      OP_REQUIRES(context, input.dims() == 4,
+                  errors::InvalidArgument("input must be 4-dimensional: ",
+                                          input.shape().DebugString()));
+    }
+
+    OP_REQUIRES(context, filter.dims() == 4,
+                errors::InvalidArgument("filter must be 4-dimensional: ",
+                                        filter.shape().DebugString()));
+
+    for (int i = 0; i < 3; i++) {
+      OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i),
+                                           std::numeric_limits<int>::max()),
+                  errors::InvalidArgument("filter too large"));
+    }
+
+    // When the input is in MKL format its sizes are read from the MklShape,
+    // indexed as [0]=cols, [1]=rows, [2]=depth, [3]=batch.
+    const int64 input_depth = input_in_mkl_format
+                                  ? mkl_params_.input_shape.GetSizes()[2]
+                                  : GetTensorDim(input, data_format_, 'C');
+    OP_REQUIRES(
+        context, input_depth == filter.dim_size(2),
+        errors::InvalidArgument("input and filter must have the same depth: ",
+                                input_depth, " vs ", filter.dim_size(2)));
+    // The last dimension for filter is out_depth.
+    const int out_depth = static_cast<int>(filter.dim_size(3));
+
+    // The second dimension for input is rows/height.
+    // The first dimension for filter is rows/height.
+    const int64 input_rows_raw = input_in_mkl_format
+                                     ? mkl_params_.input_shape.GetSizes()[1]
+                                     : GetTensorDim(input, data_format_, 'H');
+    OP_REQUIRES(context, FastBoundsCheck(input_rows_raw,
+                                         std::numeric_limits<int>::max()),
+                errors::InvalidArgument("Input rows too large"));
+    const int input_rows = static_cast<int>(input_rows_raw);
+    const int filter_rows = static_cast<int>(filter.dim_size(0));
+
+    // The third dimension for input is columns/width.
+    // The second dimension for filter is columns/width.
+    const int64 input_cols_raw = input_in_mkl_format
+                                     ? mkl_params_.input_shape.GetSizes()[0]
+                                     : GetTensorDim(input, data_format_, 'W');
+    OP_REQUIRES(context, FastBoundsCheck(input_cols_raw,
+                                         std::numeric_limits<int>::max()),
+                errors::InvalidArgument("Input cols too large"));
+    const int input_cols = static_cast<int>(input_cols_raw);
+    const int filter_cols = static_cast<int>(filter.dim_size(1));
+
+    // The first dimension for input is batch.
+    const int64 input_batch_raw = input_in_mkl_format
+                                      ? mkl_params_.input_shape.GetSizes()[3]
+                                      : GetTensorDim(input, data_format_, 'N');
+    OP_REQUIRES(context, FastBoundsCheck(input_batch_raw,
+                                         std::numeric_limits<int>::max()),
+                errors::InvalidArgument("batch is too large"));
+    const int batch = static_cast<int>(input_batch_raw);
+
+    // For now we take the stride from the second and third dimensions only (we
+    // do not support striding on the batch or depth dimension).
+    const int stride_rows = GetTensorDim(strides_, data_format_, 'H');
+    const int stride_cols = GetTensorDim(strides_, data_format_, 'W');
+
+    int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
+    OP_REQUIRES_OK(context,
+                   GetWindowedOutputSize(input_rows, filter_rows, stride_rows,
+                                         padding_, &out_rows, &pad_rows));
+    OP_REQUIRES_OK(context,
+                   GetWindowedOutputSize(input_cols, filter_cols, stride_cols,
+                                         padding_, &out_cols, &pad_cols));
+    TensorShape out_shape =
+        ShapeFromFormat(data_format_, batch, out_rows, out_cols, out_depth);
+
+    // Output tensor is of the following dimensions:
+    // [ in_batch, out_rows, out_cols, out_depth ]
+    Tensor* output = nullptr;
+
+    // If there is nothing to compute, still produce output 0 (as an empty
+    // tensor in TF layout) before returning. A zero batch is covered by this
+    // case too, because it makes out_shape empty.
+    // TODO(jbobba): Verify correctness here
+    //               Need semantics for Null MKL tensor
+    if (out_shape.num_elements() == 0) {
+      MklShape mkl_output_mkl_shape;
+      mkl_output_mkl_shape.SetMklTensor(false);
+      AllocateOutputSetMklshape(context, 0, &output, out_shape,
+                                mkl_output_mkl_shape);
+      return;
+    }
+
+    // Create MKL convolution primitives
+    mkl_params_.in_dims = input_in_mkl_format
+                              ? mkl_params_.input_shape.GetDimension()
+                              : input.dims();
+    mkl_params_.filter_dims = filter.dims();
+    mkl_params_.in_sizes[0] = static_cast<size_t>(input_cols);
+    mkl_params_.in_sizes[1] = static_cast<size_t>(input_rows);
+    mkl_params_.in_sizes[2] = static_cast<size_t>(input_depth);
+    mkl_params_.in_sizes[3] = static_cast<size_t>(batch);
+    mkl_params_.out_sizes[0] = static_cast<size_t>(out_cols);
+    mkl_params_.out_sizes[1] = static_cast<size_t>(out_rows);
+    mkl_params_.out_sizes[2] = static_cast<size_t>(out_depth);
+    mkl_params_.out_sizes[3] = static_cast<size_t>(batch);
+    // Padding is passed to MKL as a negative offset into the input.
+    mkl_params_.input_offset[0] = static_cast<int>(-pad_cols);
+    mkl_params_.input_offset[1] = static_cast<int>(-pad_rows);
+    mkl_params_.conv_stride[0] = static_cast<size_t>(stride_cols);
+    mkl_params_.conv_stride[1] = static_cast<size_t>(stride_rows);
+
+    GetStridesFromSizes(data_format_, mkl_params_.out_strides,
+                        mkl_params_.out_sizes);
+    GetStridesFromSizes(data_format_, mkl_params_.in_strides,
+                        mkl_params_.in_sizes);
+
+    // The TF filter tensor is (rows, cols, in_depth, out_depth); the sizes
+    // handed to MKL are listed as (cols, rows, in_depth, out_depth).
+    mkl_params_.filter_sizes[0] = filter.dim_size(1);  // cols
+    mkl_params_.filter_sizes[1] = filter.dim_size(0);  // rows
+    mkl_params_.filter_sizes[2] = filter.dim_size(2);  // in_depth
+    mkl_params_.filter_sizes[3] = filter.dim_size(3);  // out_depth
+
+    // TF filter layout - (rows, cols, in_depth, out_depth)
+    mkl_params_.filter_strides[0] =
+        filter.dim_size(2) * filter.dim_size(3);  // cols
+    mkl_params_.filter_strides[1] =
+        filter.dim_size(1) * filter.dim_size(2) * filter.dim_size(3);  // rows
+    mkl_params_.filter_strides[2] = filter.dim_size(3);  // in_depth
+    mkl_params_.filter_strides[3] = 1;                   // out_depth
+
+    if (biasEnabled) {
+      const Tensor& bias = MklGetInput(context, 2);
+      mkl_params_.bias_sizes[0] = {static_cast<size_t>(bias.dim_size(0))};
+      mkl_params_.bias_strides[0] = {1};
+    }
+
+    // Create Convolution Primitive
+    if (biasEnabled) {
+      CHECK_EQ(dnnConvolutionCreateForwardBias_F32(
+                   &mkl_prim_convolution_fwd_, nullptr,
+                   dnnAlgorithmConvolutionDirect, mkl_params_.in_dims,
+                   mkl_params_.in_sizes, mkl_params_.out_sizes,
+                   mkl_params_.filter_sizes, mkl_params_.conv_stride,
+                   mkl_params_.input_offset, dnnBorderZeros),
+               E_SUCCESS);
+    } else {
+      CHECK_EQ(dnnConvolutionCreateForward_F32(
+                   &mkl_prim_convolution_fwd_, nullptr,
+                   dnnAlgorithmConvolutionDirect, mkl_params_.in_dims,
+                   mkl_params_.in_sizes, mkl_params_.out_sizes,
+                   mkl_params_.filter_sizes, mkl_params_.conv_stride,
+                   mkl_params_.input_offset, dnnBorderZeros),
+               E_SUCCESS);
+    }
+
+    // The output is produced in the primitive's destination layout. The TF
+    // tensor allocated for it is a flat buffer sized from that layout, and
+    // the logical sizes/strides travel in the accompanying MklShape.
+    TensorShape mkl_output_tf_shape;
+    MklShape mkl_output_mkl_shape;
+    mkl_output_mkl_shape.SetMklTensor(true);
+    mkl_output_mkl_shape.SetMklLayout(mkl_prim_convolution_fwd_,
+                                      dnnResourceDst);
+    mkl_output_mkl_shape.SetTfLayout(mkl_params_.in_dims, mkl_params_.out_sizes,
+                                     mkl_params_.out_strides);
+    mkl_output_tf_shape.AddDim(
+        dnnLayoutGetMemorySize_F32(
+            static_cast<dnnLayout_t>(mkl_output_mkl_shape.GetMklLayout())) /
+        sizeof(T));
+    AllocateOutputSetMklshape(context, 0, &output, mkl_output_tf_shape,
+                              mkl_output_mkl_shape);
+    mkl_conv_res_[dnnResourceDst] =
+        static_cast<void*>(output->flat<T>().data());
+
+    MklCreateInputLayouts(context);
+
+    Tensor mkl_tmp_input_buf_tensor, mkl_tmp_filter_buf_tensor,
+        mkl_tmp_bias_buf_tensor;  // Temp tensor used to allocate tmp
+                                  // buffers
+    MklPrepareConvolutionInputs(context, &mkl_tmp_input_buf_tensor,
+                                &mkl_tmp_filter_buf_tensor,
+                                &mkl_tmp_bias_buf_tensor);
+
+    // Execute convolution
+    CHECK_EQ(dnnExecute_F32(mkl_prim_convolution_fwd_, mkl_conv_res_),
+             E_SUCCESS);
+
+    MklCleanup();
+  }
+
+ private:
+  // Geometry and layout parameters gathered by Compute() and consumed when
+  // creating the MKL primitive and layouts. Size/stride arrays for the input
+  // and output are indexed [0]=cols, [1]=rows, [2]=depth, [3]=batch; the
+  // filter arrays are indexed [0]=cols, [1]=rows, [2]=in_depth, [3]=out_depth.
+  typedef struct {
+    int in_dims;               // Number of input dimensions.
+    size_t in_sizes[4];        // Input sizes.
+    size_t in_strides[4];      // Input strides, from GetStridesFromSizes.
+    size_t out_sizes[4];       // Output sizes.
+    size_t out_strides[4];     // Output strides, from GetStridesFromSizes.
+    int filter_dims;           // Number of filter dimensions.
+    size_t filter_sizes[4];    // Filter sizes.
+    size_t filter_strides[4];  // Filter strides in the TF filter tensor.
+    size_t bias_sizes[1];      // Bias length (set only when bias is enabled).
+    size_t bias_strides[1];    // Bias stride (set only when bias is enabled).
+    int input_offset[2];       // {-pad_cols, -pad_rows}.
+    size_t conv_stride[2];     // {stride_cols, stride_rows}.
+    MklShape input_shape;      // MKL shape metadata read for input 0.
+  } MklConv2DOpParams;
+
+  // Sets up the dnnLayout_t handles that describe the tensors as they arrive
+  // at this op: mkl_lt_input_, mkl_lt_filter_ and, when biasEnabled,
+  // mkl_lt_bias_. Relies on mkl_params_ having been filled in by Compute().
+  void MklCreateInputLayouts(OpKernelContext* context) {
+    if (!mkl_params_.input_shape.IsMklTensor()) {
+      // Plain TF input: describe it from its sizes and strides.
+      CHECK_EQ(
+          dnnLayoutCreate_F32(&mkl_lt_input_, mkl_params_.in_dims,
+                              mkl_params_.in_sizes, mkl_params_.in_strides),
+          E_SUCCESS);
+    } else {
+      // MKL-format input: reuse the layout recorded in its MklShape.
+      mkl_lt_input_ =
+          static_cast<dnnLayout_t>(mkl_params_.input_shape.GetCurLayout());
+    }
+
+    // The filter is always described from its TF sizes and strides.
+    CHECK_EQ(dnnLayoutCreate_F32(&mkl_lt_filter_, mkl_params_.filter_dims,
+                                 mkl_params_.filter_sizes,
+                                 mkl_params_.filter_strides),
+             E_SUCCESS);
+
+    // The bias, when present, is a 1-dimensional layout.
+    if (biasEnabled) {
+      CHECK_EQ(dnnLayoutCreate_F32(&mkl_lt_bias_, 1, mkl_params_.bias_sizes,
+                                   mkl_params_.bias_strides),
+               E_SUCCESS);
+    }
+  }
+
+  // Fills mkl_conv_res_ with the source, filter and (when biasEnabled) bias
+  // buffers for the convolution primitive. For each of them the incoming
+  // layout is compared with the layout the primitive asks for; on a mismatch
+  // the data is converted into a buffer obtained from AllocTmpBuffer (which
+  // is handed the matching mkl_tmp_*_buf_tensor) and that buffer is used
+  // instead of the tensor's own data.
+  void MklPrepareConvolutionInputs(OpKernelContext* context,
+                                   Tensor* mkl_tmp_input_buf_tensor,
+                                   Tensor* mkl_tmp_filter_buf_tensor,
+                                   Tensor* mkl_tmp_bias_buf_tensor) {
+    // --- Source (input 0) ---
+    const Tensor& input = MklGetInput(context, 0);
+    void* mkl_buf_input =
+        const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
+    dnnLayout_t mkl_lt_internal_input = nullptr;
+    CHECK_EQ(
+        dnnLayoutCreateFromPrimitive_F32(
+            &mkl_lt_internal_input, mkl_prim_convolution_fwd_, dnnResourceSrc),
+        E_SUCCESS);
+    void* mkl_buf_convert_input = nullptr;
+    const bool mkl_convert_input =
+        !dnnLayoutCompare_F32(mkl_lt_internal_input, mkl_lt_input_);
+    if (mkl_convert_input) {
+      dnnPrimitive_t mkl_prim_convert_input = nullptr;
+      CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, mkl_lt_input_,
+                                       mkl_lt_internal_input),
+               E_SUCCESS);
+      AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
+                     &mkl_buf_convert_input);
+      CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input,
+                                        mkl_buf_convert_input),
+               E_SUCCESS);
+      // The conversion primitive is only needed for this one execution.
+      dnnDelete_F32(mkl_prim_convert_input);
+    }
+    dnnLayoutDelete_F32(mkl_lt_internal_input);
+    mkl_conv_res_[dnnResourceSrc] =
+        mkl_convert_input ? mkl_buf_convert_input : mkl_buf_input;
+
+    // --- Filter (input 1) ---
+    const Tensor& filter = MklGetInput(context, 1);
+    void* mkl_buf_filter =
+        const_cast<void*>(static_cast<const void*>(filter.flat<T>().data()));
+    dnnLayout_t mkl_lt_internal_filter = nullptr;
+    CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_filter,
+                                              mkl_prim_convolution_fwd_,
+                                              dnnResourceFilter),
+             E_SUCCESS);
+    void* mkl_buf_convert_filter = nullptr;
+    const bool mkl_convert_filter =
+        !dnnLayoutCompare_F32(mkl_lt_internal_filter, mkl_lt_filter_);
+    if (mkl_convert_filter) {
+      dnnPrimitive_t mkl_prim_convert_filter = nullptr;
+      CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_filter, mkl_lt_filter_,
+                                       mkl_lt_internal_filter),
+               E_SUCCESS);
+      AllocTmpBuffer(context, mkl_tmp_filter_buf_tensor, mkl_lt_internal_filter,
+                     &mkl_buf_convert_filter);
+      CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_filter, mkl_buf_filter,
+                                        mkl_buf_convert_filter),
+               E_SUCCESS);
+      dnnDelete_F32(mkl_prim_convert_filter);
+    }
+    dnnLayoutDelete_F32(mkl_lt_internal_filter);
+    mkl_conv_res_[dnnResourceFilter] =
+        mkl_convert_filter ? mkl_buf_convert_filter : mkl_buf_filter;
+
+    // --- Bias (input 2), only for the with-bias kernel ---
+    if (biasEnabled) {
+      const Tensor& bias = MklGetInput(context, 2);
+      void* mkl_buf_bias =
+          const_cast<void*>(static_cast<const void*>(bias.flat<T>().data()));
+      dnnLayout_t mkl_lt_internal_bias = nullptr;
+      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_bias,
+                                                mkl_prim_convolution_fwd_,
+                                                dnnResourceBias),
+               E_SUCCESS);
+      void* mkl_buf_convert_bias = nullptr;
+      const bool mkl_convert_bias =
+          !dnnLayoutCompare_F32(mkl_lt_internal_bias, mkl_lt_bias_);
+      if (mkl_convert_bias) {
+        dnnPrimitive_t mkl_prim_convert_bias = nullptr;
+        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_bias, mkl_lt_bias_,
+                                         mkl_lt_internal_bias),
+                 E_SUCCESS);
+        AllocTmpBuffer(context, mkl_tmp_bias_buf_tensor, mkl_lt_internal_bias,
+                       &mkl_buf_convert_bias);
+        CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_bias, mkl_buf_bias,
+                                          mkl_buf_convert_bias),
+                 E_SUCCESS);
+        dnnDelete_F32(mkl_prim_convert_bias);
+      }
+      dnnLayoutDelete_F32(mkl_lt_internal_bias);
+      mkl_conv_res_[dnnResourceBias] =
+          mkl_convert_bias ? mkl_buf_convert_bias : mkl_buf_bias;
+    }
+  }
+
+  // Releases the MKL objects created for one Compute() call: the convolution
+  // primitive and the layouts built by MklCreateInputLayouts().
+  void MklCleanup() {
+    dnnDelete_F32(mkl_prim_convolution_fwd_);
+    // mkl_lt_input_ was created by this op only when the input was not in
+    // MKL format; otherwise it came from the input's MklShape.
+    if (!mkl_params_.input_shape.IsMklTensor()) {
+      dnnLayoutDelete_F32(mkl_lt_input_);
+    }
+    dnnLayoutDelete_F32(mkl_lt_filter_);
+    if (biasEnabled) {
+      dnnLayoutDelete_F32(mkl_lt_bias_);
+    }
+  }
+  
+  std::vector<int32> strides_;
+  Padding padding_;
+  TensorFormat data_format_;
+
+  MklConv2DOpParams mkl_params_;
+  dnnPrimitive_t mkl_prim_convolution_fwd_ = nullptr;
+  void* mkl_conv_res_[dnnResourceNumber];
+  dnnLayout_t mkl_lt_filter_ = nullptr, mkl_lt_bias_ = nullptr,
+              mkl_lt_input_ = nullptr;
+  
+
+};
+
+// Registers the CPU kernels for "MklConv2D" (no bias) and
+// "MklConv2DWithBias" for element type T, under the MKL layer label.
+#define REGISTER_MKL_CPU(T)                                                \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("MklConv2D").Device(DEVICE_CPU)                                 \
+      .TypeConstraint<T>("T")                                              \
+      .Label(mkl_layer_registry::kMklLayerLabel),                          \
+      MklConv2DOp<CPUDevice, T, false>);                                   \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("MklConv2DWithBias").Device(DEVICE_CPU)                         \
+      .TypeConstraint<T>("T")                                              \
+      .Label(mkl_layer_registry::kMklLayerLabel),                          \
+      MklConv2DOp<CPUDevice, T, true>);
+
+TF_CALL_float(REGISTER_MKL_CPU);
+// Keep the helper macro local to this registration.
+#undef REGISTER_MKL_CPU
+
+}  // namespace tensorflow
+#endif  // INTEL_MKL
--- a/tensorflow/core/kernels/mkl_tfconv_op.cc
+++ b/tensorflow/core/kernels/mkl_tfconv_op.cc
@ -0,0 +1,135 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+
+#include <vector>
+#include <algorithm>
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/util/tensor_format.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/macros.h"
+
+#include "third_party/mkl/include/mkl_dnn_types.h"
+#include "third_party/mkl/include/mkl_dnn.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+namespace tensorflow {
+  typedef Eigen::ThreadPoolDevice CPUDevice;
+
+///////////////////////////////////////////////////////////
+//               Op kernel
+///////////////////////////////////////////////////////////
+
+// Converts a tensor from MKL layout back to plain TensorFlow layout. If the
+// input is already a TF tensor it is forwarded to the output unchanged.
+template <typename Device, typename T>
+class MklToTfOp : public OpKernel {
+ public:
+  // Reads the "data_format" and "T" attributes of the op.
+  explicit MklToTfOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
+    OP_REQUIRES_OK(context, context->GetAttr("T", &op_data_type));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    // 1. Check that input tensor is in MKL format.
+    const Tensor& input_tensor = MklGetInput(context, 0);
+    MklShape input_shape;
+    GetMklShape(context, 0, &input_shape);
+
+    // if input is already in Tf format, then just copy input tensor to output.
+    if (!input_shape.IsMklTensor()) {
+      context->set_output(0, input_tensor);
+      VLOG(1) << "MKLToTFConversion: No conversion needed, "
+              << "copying input to output";
+      return;
+    }
+
+    // Check that input data type is same as operator data type and that it is
+    // same as output data type.
+    DataType input_data_type  = input_type(0);
+    DataType output_data_type = output_type(0);
+    CHECK_EQ(op_data_type, input_data_type);
+    CHECK_EQ(op_data_type, output_data_type);
+
+    // 2. Recreate the Tf tensor shape based on sizes and strides.
+    // Ideally, we should know what the data_format is, but that attribute
+    // to this op is not reliable. So below, we rely on sorting logic where
+    // we sort strides first and then sizes.
+    // Each entry is (size, stride); 64-bit values are used so that large
+    // sizes and strides are not truncated.
+    TensorShape output_shape;
+    std::vector<std::pair<int64, int64>> shape_size;
+    const size_t num_dims = input_shape.GetDimension();
+    shape_size.reserve(num_dims);
+    for (size_t i = 0; i < num_dims; i++) {
+      VLOG(1) << "Size: " << input_shape.GetSizes()[i]
+              << ", Strides: " << input_shape.GetStrides()[i];
+      shape_size.emplace_back(static_cast<int64>(input_shape.GetSizes()[i]),
+                              static_cast<int64>(input_shape.GetStrides()[i]));
+    }
+
+    // Largest stride first (outermost dimension); ties are broken by the
+    // larger size.
+    std::sort(shape_size.begin(), shape_size.end(),
+              [](const std::pair<int64, int64>& a,
+                 const std::pair<int64, int64>& b) {
+      return (a.second > b.second) ||
+             (a.second == b.second && a.first > b.first);
+    });
+
+    for (const std::pair<int64, int64>& s_s : shape_size) {
+      VLOG(1) << "Added dimension: " << s_s.first;
+      output_shape.AddDim(s_s.first);
+    }
+
+    // Allocate output tensor.
+    Tensor* output_tensor = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, output_shape, &output_tensor));
+
+    // 3. Get input and output layout pointers.
+    dnnLayout_t output_layout = static_cast<dnnLayout_t>(
+                                  input_shape.GetTfLayout());
+
+    // 4. Execute DNNConversion.
+    void *input_buffer  = static_cast<void*>(const_cast<T*>(
+                              input_tensor.flat<T>().data()));
+    void *output_buffer = static_cast<void*>(
+                              output_tensor->flat<T>().data());
+    input_shape.GetConvertedFlatData(output_layout, input_buffer,
+                                     output_buffer);
+
+    VLOG(1) << "MKLToTFConversion complete successfully.";
+  }
+
+ private:
+    /// Data format of the operation
+    string data_format_str;
+
+    /// Data type of the operation
+    DataType op_data_type;
+};
+
+///////////////////////////////////////////////////////////
+//               Register kernel
+///////////////////////////////////////////////////////////
+
+// Registers MklToTfOp as the CPU kernel for the "MklToTf" op for element type
+// T, under the MKL layer label. The macro is undefined again right after use.
+#define REGISTER_CPU(T) \
+  REGISTER_KERNEL_BUILDER( \
+    Name("MklToTf").Device(DEVICE_CPU).TypeConstraint<T>("T") \
+    .Label(mkl_layer_registry::kMklLayerLabel), \
+    MklToTfOp<CPUDevice, T>);
+
+TF_CALL_float(REGISTER_CPU);
+#undef REGISTER_CPU
+}  // namespace tensorflow
+#endif /* INTEL_MKL */
--- a/tensorflow/core/kernels/mkl_transpose_op.cc
+++ b/tensorflow/core/kernels/mkl_transpose_op.cc
@ -0,0 +1,67 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/array_ops.cc.
+
+#ifdef INTEL_MKL
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/transpose_op.h"
+#include "tensorflow/core/kernels/transpose_functor.h"
+#include "third_party/mkl/include/mkl_trans.h"
+
+namespace tensorflow {
+
+// output = TransposeOp(T<any> input, T<int32> perm) takes a tensor
+// of type T and rank N, and a permutation of 0, 1, ..., N-1. It
+// shuffles the dimensions of the input tensor according to permutation.
+//
+// Specifically, the returned tensor output meets the following condition:
+// 1) output.dims() == input.dims();
+// 2) output.dim_size(i) == input.dim_size(perm[i]);
+// 3) output.tensor<T, N>(i_0, i_1, ..., i_N-1) ==
+//      input.tensor<T, N>(j_0, j_1, ..., j_N-1),
+//    where i_s == j_{perm[s]}
+//
+// REQUIRES: perm is a vector of int32.
+// REQUIRES: input.dims() == perm.size().
+// REQUIRES: perm is a permutation.
+
+// Transposes `in` into `out` according to `perm`. A non-empty 2-D float
+// tensor whose permutation swaps the two axes is handled by MKL's
+// mkl_somatcopy; every other case is delegated to the Eigen implementation.
+Status MklTransposeCpuOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
+                                      gtl::ArraySlice<int32> perm,
+                                      Tensor* out) {
+  // mkl_somatcopy below always performs a real transpose, so only take this
+  // path when that is what `perm` asks for. Empty tensors are left to the
+  // fallback so that MKL is never given a zero leading dimension.
+  const bool swaps_2d_axes =
+      perm.size() == 2 && perm[0] == 1 && perm[1] == 0;
+  if (in.dims() == 2 && in.dtype() == DT_FLOAT && swaps_2d_axes &&
+      in.NumElements() > 0) {
+    float* user_o = out->flat<float>().data();
+    const float* user_i = in.flat<float>().data();
+
+    // Documentation here: https://software.intel.com/en-us/node/520863
+    // Parameters: (ordering:row-major, operation:transpose, num_rows, num_cols,
+    //              alpha (for scaling), array, dist_bet_adjacent_cols/rows
+    //              (source), array, dist_bet_adjacent_cols/rows (dest))
+    mkl_somatcopy('R', 'T', in.dim_size(0), in.dim_size(1), 1.0f,
+                  user_i, in.dim_size(1),
+                  user_o, in.dim_size(0));
+
+    return Status::OK();
+  }
+
+  // Fallback to eigen if transpose parameters not supported by MKL
+  typedef Eigen::ThreadPoolDevice CPUDevice;
+  return ::tensorflow::DoTranspose(ctx->eigen_device<CPUDevice>(), in, perm,
+                                   out);
+} // MklTransposeCpuOp::DoTranspose
+} // namespace tensorflow
+
+#endif  // INTEL_MKL
--- a/tensorflow/core/kernels/pooling_ops_common.cc
+++ b/tensorflow/core/kernels/pooling_ops_common.cc
@ -64,6 +64,8 @@ PoolParameters::PoolParameters(OpKernelContext* context,
    OP_REQUIRES_OK(
        context, GetWindowedOutputSize(tensor_in_cols, window_cols, col_stride,
                                       padding, &out_width, &pad_cols));
+    pad_depth = 0;
+    out_depth = depth;
  } else {
    // Our current version of depthwise max pooling does not support
    // any padding, and expects the depth_window to equal the
--- a/tensorflow/core/kernels/resize_nearest_neighbor_op.cc
+++ b/tensorflow/core/kernels/resize_nearest_neighbor_op.cc
@ -66,9 +66,7 @@ class ResizeNearestNeighborOp : public OpKernel {
          const int64 in_x =
              std::min(static_cast<int64>(floorf(x * st.width_scale)),
                       (st.in_width - 1));
-          for (int c = 0; c < st.channels; ++c) {
-            output_data(b, y, x, c) = input_data(b, in_y, in_x, c);
-          }
+          std::copy_n(&input_data(b, in_y, in_x, 0), st.channels, &output_data(b, y, x, 0));
        }
      }
    }
--- a/tensorflow/core/kernels/resize_nearest_neighbor_op_benchmark_test.cc
+++ b/tensorflow/core/kernels/resize_nearest_neighbor_op_benchmark_test.cc
@ -21,7 +21,8 @@ limitations under the License.

 namespace tensorflow {

-static Graph* BM_ResizeNearestNeighbor(int batches, int width, int height) {
+static Graph* BM_Resize(const char* algorithm,
+                        int batches, int width, int height) {
  Graph* g = new Graph(OpRegistry::Global());
  Tensor in(DT_FLOAT, TensorShape({batches, width, height, 3}));
  in.flat<float>().setRandom();
@ -32,21 +33,26 @@ static Graph* BM_ResizeNearestNeighbor(int batches, int width, int height) {
  out_size_flat(1) = height * 2;

  Node* ret;
-  NodeBuilder(g->NewName("n"), "ResizeNearestNeighbor")
+  Status s = NodeBuilder(g->NewName("n"), algorithm)
                 .Input(test::graph::Constant(g, in))
                 .Input(test::graph::Constant(g, out_size))
                 .Finalize(g, &ret);
+  assert(s.ok());
  return g;
 }

-#define BM_ResizeNearestNeighborDev(DEVICE, B, W, H)                           \
-  static void BM_ResizeNearestNeighbor_##DEVICE##_##B##_##W##_##H(int iters) { \
+#define BM_ResizeDev(DEVICE, ALGORITHM, B, W, H)                               \
+  static void BM_Resize_##ALGORITHM##_##DEVICE##_##B##_##W##_##H(int iters) {  \
    testing::ItemsProcessed(iters* B* W* H * 3);                               \
-    test::Benchmark(#DEVICE, BM_ResizeNearestNeighbor(B, W, H)).Run(iters);    \
+    test::Benchmark(#DEVICE, BM_Resize(#ALGORITHM, B, W, H)).Run(iters);       \
  }                                                                            \
-  BENCHMARK(BM_ResizeNearestNeighbor_##DEVICE##_##B##_##W##_##H)
+  BENCHMARK(BM_Resize_##ALGORITHM##_##DEVICE##_##B##_##W##_##H)

-BM_ResizeNearestNeighborDev(cpu, 1, 499, 499);
-BM_ResizeNearestNeighborDev(gpu, 1, 499, 499);
+BM_ResizeDev(cpu, ResizeNearestNeighbor, 10, 499, 499);
+BM_ResizeDev(gpu, ResizeNearestNeighbor, 10, 499, 499);
+
+BM_ResizeDev(cpu, ResizeBilinear, 10, 499, 499);
+BM_ResizeDev(gpu, ResizeBilinear, 10, 499, 499);

 }  // namespace tensorflow
+
--- a/tensorflow/core/kernels/transpose_op.cc
+++ b/tensorflow/core/kernels/transpose_op.cc
@ -180,6 +180,20 @@ Status TransposeCpuOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
                                   out);
 }

+#ifdef INTEL_MKL
+#define REGISTER(T)                                           \
+  REGISTER_KERNEL_BUILDER(Name("Transpose")                   \
+                              .Device(DEVICE_CPU)             \
+                              .TypeConstraint<T>("T")         \
+                              .TypeConstraint<int32>("Tperm") \
+                              .HostMemory("perm"),            \
+                          MklTransposeCpuOp);
+TF_CALL_ALL_TYPES(REGISTER);
+REGISTER(bfloat16);
+#undef REGISTER
+
+#else  // INTEL_MKL
+
 #define REGISTER(T)                                           \
  REGISTER_KERNEL_BUILDER(Name("Transpose")                   \
                              .Device(DEVICE_CPU)             \
@ -190,6 +204,7 @@ Status TransposeCpuOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
 TF_CALL_ALL_TYPES(REGISTER)
 REGISTER(bfloat16);
 #undef REGISTER
+#endif  // INTEL_MKL

 #if GOOGLE_CUDA
 Status TransposeGpuOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
--- a/tensorflow/core/kernels/transpose_op.h
+++ b/tensorflow/core/kernels/transpose_op.h
@ -41,6 +41,17 @@ class TransposeCpuOp : public TransposeOp {
                     gtl::ArraySlice<int32> perm, Tensor* out) override;
 };

+#ifdef INTEL_MKL
+// CPU transpose kernel that is only declared when INTEL_MKL is defined.
+// It reuses TransposeOp and overrides just the DoTranspose hook.
+class MklTransposeCpuOp : public TransposeOp {
+ public:
+  explicit MklTransposeCpuOp(OpKernelConstruction* ctx) : TransposeOp(ctx) {}
+
+ protected:
+  // Transposes `in` into `out` using the permutation `perm`. The definition
+  // falls back to the Eigen-based DoTranspose when the transpose parameters
+  // are not supported by MKL.
+  Status DoTranspose(OpKernelContext* ctx, const Tensor& in,
+                     gtl::ArraySlice<int32> perm, Tensor* out) override;
+};
+#endif  // INTEL_MKL
+
 class TransposeGpuOp : public TransposeOp {
 public:
  explicit TransposeGpuOp(OpKernelConstruction* ctx) : TransposeOp(ctx) {}
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@ -2502,4 +2502,45 @@ scale_after_normalization: A bool indicating whether the resulted tensor
  needs to be multiplied with gamma.
 )doc");

+#ifdef INTEL_MKL
+REGISTER_OP("MklConv2D")
+    .Input("input: T")
+    .Input("mkl_input: uint8")
+    .Input("filter: T")
+    .Input("mkl_filter: uint8")
+    .Output("output: T")
+    .Output("mkl_output: uint8")
+    .Attr("T: {half, float, double}")
+    .Attr("strides: list(int)")
+    .Attr("use_cudnn_on_gpu: bool = true")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .SetShapeFn(shape_inference::Conv2DShape)
+    .Doc(R"doc(
+MKL version of Conv2D
+)doc");
+
+REGISTER_OP("MklConv2DWithBias")
+    .Input("input: T")
+    .Input("mkl_input: uint8")
+    .Input("filter: T")
+    .Input("mkl_filter: uint8")
+    .Input("bias: T")
+    .Input("mkl_bias: uint8")
+    .Output("output: T")
+    .Output("mkl_output: uint8")
+    .Attr("T: {half, float, double}")
+    .Attr("strides: list(int)")
+    .Attr("use_cudnn_on_gpu: bool = true")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString());
+
+REGISTER_OP("MklToTf")
+    .Input("input: T")
+    .Input("mkl_input: uint8")
+    .Output("output: T")
+    .Attr("T: {half, float, double}")
+    .Attr(GetConvnetDataFormatAttrString());
+#endif  // INTEL_MKL
+
 }  // namespace tensorflow
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@ -25758,6 +25758,59 @@ op {
  summary: "Computes the sum along segments of a tensor."
  description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n`(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such\nthat `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`\nneed not be sorted and need not cover all values in the full\nrange of valid values.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n`num_segments` should equal the number of distinct segment IDs.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/UnsortedSegmentSum.png\" alt>\n</div>"
 }
+op {
+  name: "UnsortedSegmentMax"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    description: "A tensor whose shape is a prefix of `data.shape`."
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "num_segments"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    description: "Has same shape as data, except for the first `segment_ids.rank`\ndimensions, which are replaced with a single dimension which has size\n`num_segments`."
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  summary: "Computes the max along segments of a tensor."
+  description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n\\\\(output_i = \\sum_j data_j\\\\) where sum is over `j` such\nthat `segment_ids[j] == i`. Unlike `SegmentSum`, `segment_ids`\nneed not be sorted and need not cover all values in the full\n  range of valid values.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n`num_segments` should equal the number of distinct segment IDs.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/UnsortedSegmentSum.png\" alt>\n</div>"
+}
 op {
  name: "Unstage"
  output_arg {
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@ -4,11 +4,6 @@ load("@protobuf//:protobuf.bzl", "cc_proto_library")
 load("@protobuf//:protobuf.bzl", "py_proto_library")
 load("//tensorflow:tensorflow.bzl", "if_not_mobile")

-# configure may change the following lines
-WITH_GCP_SUPPORT = False
-WITH_HDFS_SUPPORT = False
-WITH_JEMALLOC = True
-
 # Appends a suffix to a list of deps.
 def tf_deps(deps, suffix):
  tf_deps = []
@ -196,61 +191,54 @@ def tf_additional_test_srcs():
 def tf_kernel_tests_linkstatic():
  return 0

-# jemalloc only enabled on Linux for now.
-# TODO(jhseu): Enable on other platforms.
 def tf_additional_lib_defines():
-  defines = []
-  if WITH_JEMALLOC:
-    defines += select({
-        "//tensorflow:linux_x86_64": [
-            "TENSORFLOW_USE_JEMALLOC"
+  return select({
+      "//tensorflow:with_jemalloc": ["TENSORFLOW_USE_JEMALLOC"],
+      "//conditions:default": [],
+  })
+
+def tf_additional_lib_deps():
+  return select({
+      "//tensorflow:with_jemalloc": ["@jemalloc"],
+      "//conditions:default": [],
+  })
+
+def tf_additional_core_deps():
+  return select({
+      "//tensorflow:with_gcp_support": [
+          "//tensorflow/core/platform/cloud:gcs_file_system",
+      ],
+      "//conditions:default": [],
+  }) + select({
+      "//tensorflow:with_hdfs_support": [
+          "//tensorflow/core/platform/hadoop:hadoop_file_system",
      ],
      "//conditions:default": [],
  })
-  return defines
-
-def tf_additional_lib_deps():
-  deps = []
-  if WITH_JEMALLOC:
-    deps += select({
-        "//tensorflow:linux_x86_64": ["@jemalloc"],
-        "//conditions:default": [],
-    })
-  return deps
-
-def tf_additional_core_deps():
-  deps = []
-  if WITH_GCP_SUPPORT:
-    deps.append("//tensorflow/core/platform/cloud:gcs_file_system")
-  if WITH_HDFS_SUPPORT:
-    deps.append("//tensorflow/core/platform/hadoop:hadoop_file_system")
-  return deps

 # TODO(jart, jhseu): Delete when GCP is default on.
 def tf_additional_cloud_op_deps():
-  deps = []
-  if WITH_GCP_SUPPORT:
-    deps = select({
+  return select({
      "//tensorflow:windows": [],
      "//tensorflow:android": [],
      "//tensorflow:ios": [],
-      "//conditions:default":
-        ["//tensorflow/contrib/cloud:bigquery_reader_ops_op_lib"],
+      "//tensorflow:with_gcp_support": [
+        "//tensorflow/contrib/cloud:bigquery_reader_ops_op_lib",
+      ],
+      "//conditions:default": [],
  })
-  return deps

 # TODO(jart, jhseu): Delete when GCP is default on.
 def tf_additional_cloud_kernel_deps():
-  deps = []
-  if WITH_GCP_SUPPORT:
-    deps = select({
+  return select({
      "//tensorflow:windows": [],
      "//tensorflow:android": [],
      "//tensorflow:ios": [],
-      "//conditions:default":
-        ["//tensorflow/contrib/cloud/kernels:bigquery_reader_ops"],
+      "//tensorflow:with_gcp_support": [
+        "//tensorflow/contrib/cloud/kernels:bigquery_reader_ops",
+      ],
+      "//conditions:default": [],
  })
-  return deps

 def tf_lib_proto_parsing_deps():
  return [
--- a/tensorflow/core/platform/default/build_config_root.bzl
+++ b/tensorflow/core/platform/default/build_config_root.bzl
@ -2,8 +2,6 @@
 # The functions in this file might be referred by tensorflow.bzl. They have to
 # be separate to avoid cyclic references.

-WITH_XLA_SUPPORT = False
-
 def tf_cuda_tests_tags():
  return ["local"]

@ -11,16 +9,16 @@ def tf_sycl_tests_tags():
  return ["local"]

 def tf_additional_plugin_deps():
-  deps = []
-  if WITH_XLA_SUPPORT:
-    deps.append("//tensorflow/compiler/jit")
-  return deps
+  return select({
+      "//tensorflow:with_xla_support": ["//tensorflow/compiler/jit"],
+      "//conditions:default": [],
+  })

 def tf_additional_xla_deps_py():
  return []

 def tf_additional_license_deps():
-  licenses = []
-  if WITH_XLA_SUPPORT:
-    licenses.append("@llvm//:LICENSE.TXT")
-  return licenses
+  return select({
+      "//tensorflow:with_xla_support": ["@llvm//:LICENSE.TXT"],
+      "//conditions:default": [],
+  })
--- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
@ -58,6 +58,7 @@ class LibHDFS {
  std::function<hdfsFS(hdfsBuilder*)> hdfsBuilderConnect;
  std::function<hdfsBuilder*()> hdfsNewBuilder;
  std::function<void(hdfsBuilder*, const char*)> hdfsBuilderSetNameNode;
+  std::function<int(const char*, char**)> hdfsConfGetStr;
  std::function<void(hdfsBuilder*, const char* kerbTicketCachePath)>
      hdfsBuilderSetKerbTicketCachePath;
  std::function<int(hdfsFS, hdfsFile)> hdfsCloseFile;
@ -85,6 +86,7 @@ class LibHDFS {
      BIND_HDFS_FUNC(hdfsBuilderConnect);
      BIND_HDFS_FUNC(hdfsNewBuilder);
      BIND_HDFS_FUNC(hdfsBuilderSetNameNode);
+      BIND_HDFS_FUNC(hdfsConfGetStr);
      BIND_HDFS_FUNC(hdfsBuilderSetKerbTicketCachePath);
      BIND_HDFS_FUNC(hdfsCloseFile);
      BIND_HDFS_FUNC(hdfsPread);
@ -147,6 +149,18 @@ Status HadoopFileSystem::Connect(StringPiece fname, hdfsFS* fs) {
  hdfsBuilder* builder = hdfs_->hdfsNewBuilder();
  if (scheme == "file") {
    hdfs_->hdfsBuilderSetNameNode(builder, nullptr);
+  } else if (scheme == "viewfs") {
+    char *defaultFS = NULL;
+    hdfs_->hdfsConfGetStr("fs.defaultFS", &defaultFS);
+    StringPiece defaultScheme, defaultCluster, defaultPath;
+    io::ParseURI(defaultFS, &defaultScheme, &defaultCluster, &defaultPath);
+
+    if (scheme != defaultScheme || namenode != defaultCluster) {
+      return errors::Unimplemented("viewfs is only supported as a fs.defaultFS.");
+    }
+    // The default NameNode configuration will be used (from the XML configuration files). See:
+    // https://github.com/tensorflow/tensorflow/blob/v1.0.0/third_party/hadoop/hdfs.h#L259
+    hdfs_->hdfsBuilderSetNameNode(builder, "default");
  } else {
    hdfs_->hdfsBuilderSetNameNode(builder, nn.c_str());
  }
@ -478,5 +492,6 @@ Status HadoopFileSystem::Stat(const string& fname, FileStatistics* stats) {
 }

 REGISTER_FILE_SYSTEM("hdfs", HadoopFileSystem);
+REGISTER_FILE_SYSTEM("viewfs", HadoopFileSystem);

 }  // namespace tensorflow
--- a/tensorflow/core/platform/macros.h
+++ b/tensorflow/core/platform/macros.h
@ -53,6 +53,17 @@ limitations under the License.
 #define TF_SCANF_ATTRIBUTE(string_index, first_to_check)
 #endif

+// Control visibility outside .so
+#if defined(COMPILER_MSVC)
+# ifdef TF_COMPILE_LIBRARY
+#  define TF_EXPORT __declspec(dllexport)
+# else
+#  define TF_EXPORT __declspec(dllimport)
+# endif   // TF_COMPILE_LIBRARY
+#else
+# define TF_EXPORT __attribute__((visibility("default")))
+#endif  // COMPILER_MSVC
+
 // GCC can be told that a certain branch is not likely to be taken (for
 // instance, a CHECK failure), and use that information in static analysis.
 // Giving it this information can help it optimize for the common case in
--- a/tensorflow/core/platform/windows/cpu_info.h
+++ b/tensorflow/core/platform/windows/cpu_info.h
@ -16,6 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_PLATFORM_WINDOWS_CPU_INFO_H_
 #define TENSORFLOW_PLATFORM_WINDOWS_CPU_INFO_H_

+// included so __cpuidex function is available for GETCPUID on Windows
+#include <intrin.h>
+
 // Byte order defines provided by gcc. MSVC doesn't define those so
 // we define them here.
 // We assume that all windows platform out there are little endian.
--- a/tensorflow/core/platform/windows/intrinsics_port.h
+++ b/tensorflow/core/platform/windows/intrinsics_port.h
@ -24,6 +24,9 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"

 #define _mm_load_pd1 _mm_load1_pd
+
+// only define these intrinsics if immintrin.h doesn't have them (VS2015 and earlier)
+#if _MSC_VER < 1910
 static inline int
 _mm256_extract_epi32(__m256i a, const int i)
 {
@ -39,3 +42,4 @@ _mm256_insert_epi32(__m256i a, int b, const int i)
 }
 #endif
 #endif
+#endif
--- a/tensorflow/core/platform/windows/windows_file_system.cc
+++ b/tensorflow/core/platform/windows/windows_file_system.cc
@ -230,11 +230,9 @@ Status WindowsFileSystem::NewRandomAccessFile(
  result->reset();

  // Open the file for read-only random access
-  // Random access is to disable read-ahead as the system reads too much data
  // Open in async mode which makes Windows allow more parallelism even
  // if we need to do sync I/O on top of it.
-  DWORD file_flags = FILE_ATTRIBUTE_READONLY | FILE_FLAG_RANDOM_ACCESS |
-      FILE_FLAG_OVERLAPPED;
+  DWORD file_flags = FILE_ATTRIBUTE_READONLY | FILE_FLAG_OVERLAPPED;
  // Shared access is necessary for tests to pass
  // almost all tests would work with a possible exception of fault_injection.
  DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
@ -306,8 +304,8 @@ Status WindowsFileSystem::NewReadOnlyMemoryRegionFromFile(
  result->reset();
  Status s = Status::OK();

-  // Open the file for read-only random access
-  DWORD file_flags = FILE_ATTRIBUTE_READONLY | FILE_FLAG_RANDOM_ACCESS;
+  // Open the file for read-only
+  DWORD file_flags = FILE_ATTRIBUTE_READONLY;

  // Open in async mode which makes Windows allow more parallelism even
  // if we need to do sync I/O on top of it.
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@ -0,0 +1,296 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
+#define TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
+#ifdef INTEL_MKL
+#include "third_party/mkl/include/mkl_dnn.h"
+#include "third_party/mkl/include/mkl_dnn_types.h"
+#include "third_party/mkl/include/mkl_service.h"
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+
+// The file contains a number of utility classes and functions used by MKL
+// enabled kernels
+
+namespace tensorflow {
+
+// This class encapsulates all the meta data that is associated with an MKL
+// tensor. A tensor is an MKL tensor if it was created as the result of an
+// MKL operation, and did not go through a conversion to a standard
+// Tensorflow tensor.
+
+class MklShape {
+ public:
+  MklShape() {}
+  TF_DISALLOW_COPY_AND_ASSIGN(MklShape);  // Cannot copy
+
+  ~MklShape() {
+    if (sizes_) delete[] sizes_;
+    if (strides_) delete[] strides_;
+    if (mklLayout_) CHECK_EQ(dnnLayoutDelete_F32(mklLayout_), E_SUCCESS);
+    if (tfLayout_) CHECK_EQ(dnnLayoutDelete_F32(tfLayout_), E_SUCCESS);
+  }
+
+  const bool IsMklTensor() const { return isMklTensor_; }
+
+  void SetMklTensor(const bool isMklTensor) { isMklTensor_ = isMklTensor; }
+
+  void SetMklLayout(const void* primitive, size_t resourceType) {
+    CHECK_EQ(
+        dnnLayoutCreateFromPrimitive_F32(&mklLayout_, (dnnPrimitive_t)primitive,
+                                         (dnnResourceType_t)resourceType),
+        E_SUCCESS);
+  }
+
+  void SetTfLayout(const size_t dimension, const size_t* sizes,
+                   const size_t* strides) {
+    dimension_ = dimension;
+    if (dimension > 0) {  // MKl doesn't support dimension 0
+      sizes_ = new size_t[dimension];
+      strides_ = new size_t[dimension];
+
+      for (int ii = 0; ii < dimension; ii++) {
+        sizes_[ii] = sizes[ii];
+        strides_[ii] = strides[ii];
+      }
+      CHECK_EQ(dnnLayoutCreate_F32(&tfLayout_, dimension, sizes, strides),
+               E_SUCCESS);
+    }
+  }
+
+  const dnnLayout_t GetMklLayout() const { return mklLayout_; }
+  const dnnLayout_t GetTfLayout() const { return tfLayout_; }
+  const dnnLayout_t GetCurLayout() const {
+    return isMklTensor_ ? mklLayout_ : tfLayout_;
+  }
+  size_t GetDimension() const { return dimension_; }
+  const size_t* GetSizes() const { return sizes_; }
+  const size_t* GetStrides() const { return strides_; }
+
+  void GetConvertedFlatData(dnnLayout_t targetLayout, void* input,
+                            void* output) const {
+    dnnLayout_t curLayout;
+    if (isMklTensor_)
+      curLayout = mklLayout_;
+    else
+      curLayout = tfLayout_;
+    dnnPrimitive_t convert;
+    CHECK_EQ(dnnConversionCreate_F32(&convert, curLayout, targetLayout),
+             E_SUCCESS);
+    CHECK_EQ(dnnConversionExecute_F32(convert, input, output), E_SUCCESS);
+    CHECK_EQ(dnnDelete_F32(convert), E_SUCCESS);
+  }
+
+// The following methods are used for serializing and de-serializing the
+// contents of the mklshape object.
+// The data is serialized in this order
+// isMklTensor_
+// dimension_
+// sizes
+// strides
+// mklLayout_
+// tfLayout_
+
+#define SIZE_OF_MKL_DNN_BUF \
+  (dnnLayoutSerializationBufferSize_F32())  // Size of buffer needed to
+                                            // serialize dnn_layout pointer
+
+// Size of buffer to hold the serialized object, the size is computed as follows
+// sizeof(isMklTensor_) + sizeof(dimension_) + sizeof(sizes) + sizeof(strides)
+// + sizeof(mklLayout_ buffer) + sizeof(tfLayout_ buffer)
+
+#define SIZE_OF_MKL_SERIAL_DATA(dims) \
+  (2 * sizeof(size_t) + 2 * dims * sizeof(size_t) + 2 * SIZE_OF_MKL_DNN_BUF)
+
+// First we need to define some macros for offsets into the serial buffer where
+// the different elements of MklShape are written to and read from.
+
+#define IS_MKL_TENSOR_OFFSET 0
+// Location from start of buffer where isMklTensor_ is serialized
+#define DIMS_OFFSET \
+  (IS_MKL_TENSOR_OFFSET + sizeof(size_t))  // Location of dimension_
+#define SIZES_OFFSET(dims) \
+  (DIMS_OFFSET +           \
+  sizeof(size_t))  // Location of sizes. Note dim is not used here, left here
+                    // to make macros consistent.
+#define STRIDES_OFFSET(dims) \
+  (SIZES_OFFSET(dims) + dims * sizeof(size_t))  // Location of strides
+#define MKL_LAYOUT_OFFSET(dims) \
+  (STRIDES_OFFSET(dims) + dims * sizeof(size_t))  // Location of mklLayout_
+#define TF_LAYOUT_OFFSET(dims) \
+  (MKL_LAYOUT_OFFSET(dims) + SIZE_OF_MKL_DNN_BUF)  // Location of tfLayout_
+
+  // TODO(agramesh1) make sure to create a const to share with rewrite pass
+  // for min size of MKL metadata tensor.
+
+  // Restores this object from a buffer written by SerializeMklShape.
+  //
+  // buf:      start of the serialized bytes.
+  // buf_size: number of bytes available in buf.
+  //
+  // The isMklTensor_ flag is always read. dimension_, sizes_, strides_ and
+  // both layouts are read only when the flag is set. CHECK-fails when the
+  // buffer is too small for the fields about to be read.
+  void DeSerializeMklShape(const unsigned char* buf, size_t buf_size) {
+    // Make sure buffer holds at least  isMklTensor_
+    CHECK(buf_size >= sizeof(size_t)) << "Bufsize too small in DeSerialize";
+    isMklTensor_ =
+        *reinterpret_cast<const size_t*>(buf + IS_MKL_TENSOR_OFFSET) != 0;
+
+    if (isMklTensor_) {  // If it is an MKL Tensor then read the rest
+      // dimension_ is stored right after the flag. Verify that it lies inside
+      // the buffer before dereferencing it; otherwise a buffer that only
+      // holds the flag would be read out of bounds.
+      CHECK(buf_size >= DIMS_OFFSET + sizeof(size_t))
+          << "Bufsize too small in DeSerialize";
+      dimension_ = *(reinterpret_cast<const size_t*>(buf + DIMS_OFFSET));
+      // Now that the rank is known, the full serialized size can be checked.
+      CHECK(buf_size >= SIZE_OF_MKL_SERIAL_DATA(dimension_))
+          << "Bufsize too small in DeSerialize";
+      sizes_ = new size_t[dimension_];
+      strides_ = new size_t[dimension_];
+      // size_t index matches the type of dimension_ (no signed/unsigned mix).
+      for (size_t i = 0; i < dimension_; i++) {
+        sizes_[i] =
+            reinterpret_cast<const size_t*>(buf + SIZES_OFFSET(dimension_))[i];
+        strides_[i] = reinterpret_cast<const size_t*>(
+            buf + STRIDES_OFFSET(dimension_))[i];
+      }
+      CHECK_EQ(dnnLayoutDeserialize_F32(&mklLayout_,
+                                        buf + MKL_LAYOUT_OFFSET(dimension_)),
+               E_SUCCESS);
+      CHECK_EQ(dnnLayoutDeserialize_F32(&tfLayout_,
+                                        buf + TF_LAYOUT_OFFSET(dimension_)),
+               E_SUCCESS);
+    }
+  }
+
+  void SerializeMklShape(unsigned char* buf, size_t buf_size) const {
+    CHECK(buf_size >= SIZE_OF_MKL_SERIAL_DATA(dimension_))
+        << "Bufsize too small to Serialize";
+    *reinterpret_cast<size_t*>(buf + IS_MKL_TENSOR_OFFSET) =
+        isMklTensor_ ? 1 : 0;
+    if (isMklTensor_) {
+      *(reinterpret_cast<size_t*>(buf + DIMS_OFFSET)) = dimension_;
+      for (int i = 0; i < dimension_; i++) {
+        reinterpret_cast<size_t*>(buf + SIZES_OFFSET(dimension_))[i] =
+            sizes_[i];
+        reinterpret_cast<size_t*>(buf + STRIDES_OFFSET(dimension_))[i] =
+            strides_[i];
+      }
+      CHECK_EQ(dnnLayoutSerialize_F32(mklLayout_,
+                                      buf + MKL_LAYOUT_OFFSET(dimension_)),
+               E_SUCCESS);
+      CHECK_EQ(
+          dnnLayoutSerialize_F32(tfLayout_, buf + TF_LAYOUT_OFFSET(dimension_)),
+          E_SUCCESS);
+    }
+  }
+
+ private:
+  bool isMklTensor_ =
+      false;  // Flag to indicate if the tensor is an  MKL tensor or not
+  dnnLayout_t mklLayout_ = nullptr;  // Pointer to the MKL layout
+  dnnLayout_t tfLayout_ = nullptr;   // Pointer to layout of corresponding
+  // Tensorflow tensor, used when conversion from MKL to standard tensor
+  size_t dimension_ = 0;
+  size_t* sizes_ = nullptr;    // Required by MKL for conversions
+  size_t* strides_ = nullptr;  // Required by MKL for conversions
+};
+
+// Maps the logical position n of an input/output to the index of its data
+// tensor. Data tensors use the even indices (2n); GetTensorMetaDataIndex
+// returns the odd index (2n + 1) that follows it.
+int inline GetTensorDataIndex(int n) {
+  return 2 * n;  // index corresponding to nth input/output tensor
+}
+
+// Maps the logical position n of an input/output to the index of its
+// metadata tensor, i.e. the slot right after the data tensor at 2n.
+int inline GetTensorMetaDataIndex(int n) {
+  // index corresponding to meta data of nth input/output tensor
+  return 2 * n + 1;
+}
+// Fills *mklshape for the nth logical input by deserializing the uint8
+// metadata tensor found at input index GetTensorMetaDataIndex(n).
+inline void GetMklShape(OpKernelContext* ctext, int n, MklShape* mklshape) {
+  mklshape->DeSerializeMklShape(
+      ctext->input(GetTensorMetaDataIndex(n)).flat<uint8>().data(),
+      // Buffer size in bytes: element count times the element size.
+      ctext->input(GetTensorMetaDataIndex(n)).flat<uint8>().size() *
+          sizeof(uint8));
+}
+
+// Returns the data tensor of the nth logical input, i.e. the kernel input at
+// index GetTensorDataIndex(n) (the metadata tensor is not returned).
+inline const Tensor& MklGetInput(OpKernelContext* ctext, int n) {
+  return ctext->input(GetTensorDataIndex(n));
+}
+
+// Allocate the output tensor, create a second output tensor that will contain
+// the MKL shape serialized
+//
+// ctext:    kernel context used for both allocations.
+// n:        logical output position; data goes to GetTensorDataIndex(n) and
+//           the serialized shape to GetTensorMetaDataIndex(n).
+// output:   receives the allocated data tensor of shape tfshape.
+// tfshape:  shape of the data tensor.
+// mklshape: shape object serialized into the metadata tensor.
+inline void AllocateOutputSetMklshape(OpKernelContext* ctext, int n,
+                                      Tensor** output,
+                                      const TensorShape& tfshape,
+                                      const MklShape& mklshape) {
+  Tensor* second_tensor = nullptr;
+  TensorShape second_shape;
+  // 1-D metadata tensor with one uint8 element per serialized byte.
+  second_shape.AddDim(SIZE_OF_MKL_SERIAL_DATA(mklshape.GetDimension()));
+  // NOTE(review): OP_REQUIRES_OK presumably returns from this function on
+  // failure, so the serialization below would be skipped -- confirm.
+  OP_REQUIRES_OK(
+      ctext, ctext->allocate_output(GetTensorDataIndex(n), tfshape, output));
+  OP_REQUIRES_OK(ctext, ctext->allocate_output(GetTensorMetaDataIndex(n),
+                                               second_shape, &second_tensor));
+  mklshape.SerializeMklShape(
+      second_tensor->flat<uint8>().data(),
+      second_tensor->flat<uint8>().size() * sizeof(uint8));
+}
+
+// Allocates a temp tensor and returns the data buffer for temporary storage.
+// Currently
+// we only support F32, will need to templatize if other types are added
+//
+// context:    kernel context used for allocate_temp.
+// tensor_out: receives the 1-D float temp tensor that backs the buffer.
+// lt_buff:    MKL layout whose memory size determines the buffer size.
+// buf_out:    receives a pointer to tensor_out's float data.
+inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out,
+                           dnnLayout_t lt_buff, void** buf_out) {
+  TensorShape tf_shape;
+
+  // Number of floats = layout byte size / sizeof(float), plus one extra
+  // element (presumably to cover a byte size that is not a multiple of
+  // sizeof(float) -- TODO confirm).
+  tf_shape.AddDim(
+      dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(lt_buff)) /
+          sizeof(float) +
+      1);
+  OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<float>::v(),
+                                                 tf_shape, tensor_out));
+  *buf_out = static_cast<void*>(tensor_out->flat<float>().data());
+}
+
+// Computes the four strides that correspond to `sizes` for the given
+// TensorFlow data format and writes them to strides[0..3].
+//
+// Only sizes[0..2] are read. NOTE(review): the arithmetic suggests the index
+// order is (width, height, channels, batch) -- confirm against callers.
+inline void GetStridesFromSizes(TensorFormat data_format, size_t* strides,
+                                const size_t* sizes) {
+  // MKL requires strides in NCHW
+  if (data_format == FORMAT_NHWC) {
+    // Index 2 is the unit-stride dimension in this branch.
+    strides[0] = sizes[2];
+    strides[1] = sizes[0] * sizes[2];
+    strides[2] = 1;
+    strides[3] = sizes[0] * sizes[1] * sizes[2];
+  } else {
+    // Any other format: index 0 is the unit-stride dimension and each
+    // following stride is the running product of the preceding sizes.
+    strides[0] = 1;
+    strides[1] = sizes[0];
+    strides[2] = sizes[0] * sizes[1];
+    strides[3] = sizes[0] * sizes[1] * sizes[2];
+  }
+}
+
+namespace mkl_layer_registry {
+
+static const char* kMklLayerLabel = "MklLayer";
+static const string kMklLayerLabelPattern = "label='MklLayer'";
+
+// Check whether opname is registered as MKL-compliant in the registry.
+//
+// The check is textual: the string returned by KernelsRegisteredForOp(op_name)
+// is searched for kMklLayerLabelPattern ("label='MklLayer'").
+//
+// @input: name of the op
+// @return: true if opname is registered as Mkl layer op
+static inline bool IsMklLayer(const std::string& op_name) {
+  string kernel = KernelsRegisteredForOp(op_name);
+  return kernel.find(kMklLayerLabelPattern) != string::npos;
+}
+
+} // namespace mkl_layer_registry
+
+}  // namespace tensorflow
+#endif  // INTEL_MKL
+#endif  // TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
--- a/tensorflow/docs_src/extend/adding_an_op.md
+++ b/tensorflow/docs_src/extend/adding_an_op.md
@ -1160,7 +1160,9 @@ for ZeroOut:
 ```

 `c->set_output(0, c->input(0));` declares that the first output's shape should
-be set to the first input's shape. There are a number of common shape functions
+be set to the first input's shape. If the output is selected by its index as in the above example, the second parameter of `set_output` should be a `ShapeHandle` object. You can create an empty `ShapeHandle` object by its default constructor. The `ShapeHandle` object for an input with index `idx` can be obtained by `c->input(idx)`.
+
+There are a number of common shape functions
 that apply to many ops, such as `shape_inference::UnchangedShape` which can be
 found in [common_shape_fns.h](https://www.tensorflow.org/code/tensorflow/core/framework/common_shape_fns.h) and used as follows:

@ -1220,7 +1222,15 @@ particular dimension has a very specific value using `InferenceContext::Dim` and
 `InferenceContext::WithValue`; you can specify that an output dimension is the
 sum / product of two input dimensions using `InferenceContext::Add` and
 `InferenceContext::Multiply`. See the `InferenceContext` class for
-all of the various shape manipulations you can specify.
+all of the various shape manipulations you can specify. The following example sets
+the shape of the first output to (n, 3), where the first input has shape (n, ...):
+
+```c++
+.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
+    c->set_output(0, c->Matrix(c->Dim(c->input(0), 0), 3));
+    return Status::OK();
+});
+```

 If you have a complicated shape function, you should consider adding a test for
 validating that various input shape combinations produce the expected output
--- a/tensorflow/docs_src/get_started/get_started.md
+++ b/tensorflow/docs_src/get_started/get_started.md
@ -374,7 +374,7 @@ estimator.fit(input_fn=input_fn, steps=1000)

 # Here we evaluate how well our model did. In a real example, we would want
 # to use a separate validation and testing data set to avoid overfitting.
-estimator.evaluate(input_fn=input_fn)
+print(estimator.evaluate(input_fn=input_fn))
 ```
 When run, it produces
 ```
--- a/tensorflow/docs_src/get_started/mnist/mechanics.md
+++ b/tensorflow/docs_src/get_started/mnist/mechanics.md
@ -351,7 +351,7 @@ training.

 ```python
 if step % 100 == 0:
-    print 'Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration)
+    print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration))
 ```

 #### Visualize the Status
@ -421,19 +421,19 @@ the training and test datasets.  The `do_eval()` function is called thrice, for
 the training, validation, and test datasets.

 ```python
-print 'Training Data Eval:'
+print('Training Data Eval:')
 do_eval(sess,
        eval_correct,
        images_placeholder,
        labels_placeholder,
        data_sets.train)
-print 'Validation Data Eval:'
+print('Validation Data Eval:')
 do_eval(sess,
        eval_correct,
        images_placeholder,
        labels_placeholder,
        data_sets.validation)
-print 'Test Data Eval:'
+print('Test Data Eval:')
 do_eval(sess,
        eval_correct,
        images_placeholder,
--- a/tensorflow/docs_src/programmers_guide/faq.md
+++ b/tensorflow/docs_src/programmers_guide/faq.md
@ -92,12 +92,12 @@ two following snippets of code are equivalent:
 # Using `Session.run()`.
 sess = tf.Session()
 c = tf.constant(5.0)
-print sess.run(c)
+print(sess.run(c))

 # Using `Tensor.eval()`.
 c = tf.constant(5.0)
 with tf.Session():
-  print c.eval()
+  print(c.eval())
 ```

 In the second example, the session acts as a
--- a/tensorflow/docs_src/programmers_guide/variables.md
+++ b/tensorflow/docs_src/programmers_guide/variables.md
@ -144,6 +144,11 @@ specified list, of the variables in the graph.  The saver object provides
 methods to run these ops, specifying paths for the checkpoint files to write to
 or read from.

+Note that to restore a model checkpoint without a graph one must first import
+the graph from the meta graph file (typical extension is `.meta`). This is
+done with @{tf.train.import_meta_graph}, which in turn returns a `Saver` from
+which one can then perform a `restore`.
+
 ### Checkpoint Files

 Variables are saved in binary files that, roughly, contain a map from variable
--- a/tensorflow/docs_src/tutorials/linear.md
+++ b/tensorflow/docs_src/tutorials/linear.md
@ -217,7 +217,7 @@ results = e.evaluate(input_fn=input_fn_test, steps=1)

 # Print the stats for the evaluation.
 for key in sorted(results):
-    print "%s: %s" % (key, results[key])
+    print("%s: %s" % (key, results[key]))
 ```

 ### Wide and deep learning
--- a/tensorflow/docs_src/tutorials/using_gpu.md
+++ b/tensorflow/docs_src/tutorials/using_gpu.md
@ -28,7 +28,7 @@ c = tf.matmul(a, b)
 # Creates a session with log_device_placement set to True.
 sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
 # Runs the op.
-print sess.run(c)
+print(sess.run(c))
 ```

 You should see the following output:
@ -61,7 +61,7 @@ with tf.device('/cpu:0'):
 # Creates a session with log_device_placement set to True.
 sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
 # Runs the op.
-print sess.run(c)
+print(sess.run(c))
 ```

 You will see that now `a` and `b` are assigned to `cpu:0`.
@ -131,7 +131,7 @@ with tf.device('/gpu:2'):
 # Creates a session with log_device_placement set to True.
 sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
 # Runs the op.
-print sess.run(c)
+print(sess.run(c))
 ```

 If the device you have specified does not exist, you will get
@ -160,7 +160,7 @@ with tf.device('/gpu:2'):
 sess = tf.Session(config=tf.ConfigProto(
      allow_soft_placement=True, log_device_placement=True))
 # Runs the op.
-print sess.run(c)
+print(sess.run(c))
 ```

 ## Using multiple GPUs
@ -182,7 +182,7 @@ with tf.device('/cpu:0'):
 # Creates a session with log_device_placement set to True.
 sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
 # Runs the op.
-print sess.run(sum)
+print(sess.run(sum))
 ```

 You will see the following output.
--- a/tensorflow/docs_src/tutorials/wide.md
+++ b/tensorflow/docs_src/tutorials/wide.md
@ -188,7 +188,7 @@ def input_fn(df):
  categorical_cols = {k: tf.SparseTensor(
      indices=[[i, 0] for i in range(df[k].size)],
      values=df[k].values,
-      shape=[df[k].size, 1])
+      dense_shape=[df[k].size, 1])
                      for k in CATEGORICAL_COLUMNS}
  # Merges the two dictionaries into one.
  feature_cols = dict(continuous_cols.items() + categorical_cols.items())
@ -261,6 +261,8 @@ learned through the model training process we'll go through later.
 We'll do the similar trick to define the other categorical features:

 ```python
+race = tf.contrib.layers.sparse_column_with_hash_bucket("race", hash_bucket_size=100)
+marital_status = tf.contrib.layers.sparse_column_with_hash_bucket("marital_status", hash_bucket_size=100)
 relationship = tf.contrib.layers.sparse_column_with_hash_bucket("relationship", hash_bucket_size=100)
 workclass = tf.contrib.layers.sparse_column_with_hash_bucket("workclass", hash_bucket_size=100)
 occupation = tf.contrib.layers.sparse_column_with_hash_bucket("occupation", hash_bucket_size=1000)
@ -377,7 +379,7 @@ the labels of the holdout data:
 ```python
 results = m.evaluate(input_fn=eval_input_fn, steps=1)
 for key in sorted(results):
-    print "%s: %s" % (key, results[key])
+    print("%s: %s" % (key, results[key]))
 ```

 The first line of the output should be something like `accuracy: 0.83557522`,
--- a/tensorflow/docs_src/tutorials/wide_and_deep.md
+++ b/tensorflow/docs_src/tutorials/wide_and_deep.md
@ -255,7 +255,7 @@ After reading in the data, you can train and evaluate the model:
 m.fit(input_fn=train_input_fn, steps=200)
 results = m.evaluate(input_fn=eval_input_fn, steps=1)
 for key in sorted(results):
-    print "%s: %s" % (key, results[key])
+    print("%s: %s" % (key, results[key]))
 ```

 The first line of the output should be something like `accuracy: 0.84429705`. We
--- a/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java
@ -432,7 +432,7 @@ public class StylizeActivity extends CameraActivity implements OnImageAvailableL
        // Everything else is 0, so just pick a suitable slider to push up when the
        // selected one goes down.
        if (adapter.items[lastOtherStyle] == slider) {
-          lastOtherStyle = lastOtherStyle + 1 % NUM_STYLES;
+          lastOtherStyle = (lastOtherStyle + 1) % NUM_STYLES;
        }
        adapter.items[lastOtherStyle].setValue(1.0f - value);
      }
--- a/tensorflow/examples/learn/README.md
+++ b/tensorflow/examples/learn/README.md
@ -1,7 +1,7 @@
 # TF Learn Examples

 Learn is a high-level API for TensorFlow that allows you to create,
-train, and use deep learning models easily. See the [Quickstart tutorial](../../g3doc/tutorials/tflearn/index.md)
+train, and use deep learning models easily. See the [Quickstart tutorial](https://www.tensorflow.org/get_started/tflearn)
 for an introduction to the API.

 To run most of these examples, you need to install the `scikit learn` library (`sudo pip install sklearn`).
--- a/tensorflow/examples/learn/boston.py
+++ b/tensorflow/examples/learn/boston.py
@ -16,19 +16,22 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-from sklearn import cross_validation
+
+from sklearn import datasets
+from sklearn import model_selection
 from sklearn import metrics
 from sklearn import preprocessing
+
 import tensorflow as tf


 def main(unused_argv):
  # Load dataset
-  boston = tf.contrib.learn.datasets.load_dataset('boston')
+  boston = datasets.load_boston()
  x, y = boston.data, boston.target

  # Split dataset into train / test
-  x_train, x_test, y_train, y_test = cross_validation.train_test_split(
+  x_train, x_test, y_train, y_test = model_selection.train_test_split(
      x, y, test_size=0.2, random_state=42)

  # Scale data (training set) to 0 mean and unit standard deviation.
--- a/tensorflow/examples/learn/iris.py
+++ b/tensorflow/examples/learn/iris.py
@ -17,7 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-
+from sklearn import datasets
 from sklearn import metrics
 from sklearn import model_selection

@ -26,7 +26,7 @@ import tensorflow as tf

 def main(unused_argv):
  # Load dataset.
-  iris = tf.contrib.learn.datasets.load_dataset('iris')
+  iris = datasets.load_iris()
  x_train, x_test, y_train, y_test = model_selection.train_test_split(
      iris.data, iris.target, test_size=0.2, random_state=42)

--- a/tensorflow/examples/learn/text_classification.py
+++ b/tensorflow/examples/learn/text_classification.py
@ -24,6 +24,7 @@ import numpy as np
 import pandas
 from sklearn import metrics
 import tensorflow as tf
+from tensorflow.contrib.layers.python.layers import encoders

 learn = tf.contrib.learn

@ -37,7 +38,7 @@ n_words = 0
 def bag_of_words_model(features, target):
  """A bag-of-words model. Note it disregards the word order in the text."""
  target = tf.one_hot(target, 15, 1, 0)
-  features = tf.contrib.layers.bow_encoder(
+  features = encoders.bow_encoder(
      features, vocab_size=n_words, embed_dim=EMBEDDING_SIZE)
  logits = tf.contrib.layers.fully_connected(features, 15, activation_fn=None)
  loss = tf.contrib.losses.softmax_cross_entropy(logits, target)
--- a/tensorflow/examples/tutorials/deepdream/deepdream.ipynb
+++ b/tensorflow/examples/tutorials/deepdream/deepdream.ipynb
@ -278,7 +278,7 @@
    "            tensor = n.attr['value'].tensor\n",
    "            size = len(tensor.tensor_content)\n",
    "            if size > max_const_size:\n",
-    "                tensor.tensor_content = bytes(\"<stripped %d bytes>\"%size, 'utf-8')\n",
+    "                tensor.tensor_content = bytes(\"<stripped %d bytes>\"%size)\n",
    "    return strip_def\n",
    "  \n",
    "def rename_nodes(graph_def, rename_func):\n",
--- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
+++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
@ -62,7 +62,7 @@ print('Data size', len(words))
 vocabulary_size = 50000


-def build_dataset(words):
+def build_dataset(words, vocabulary_size):
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
  dictionary = dict()
@ -81,7 +81,7 @@ def build_dataset(words):
  reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
  return data, count, dictionary, reverse_dictionary

-data, count, dictionary, reverse_dictionary = build_dataset(words)
+data, count, dictionary, reverse_dictionary = build_dataset(words, vocabulary_size)
 del words  # Hint to reduce memory.
 print('Most common words (+UNK)', count[:5])
 print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
@ -181,7 +181,7 @@ with graph.as_default():
      valid_embeddings, normalized_embeddings, transpose_b=True)

  # Add variable initializer.
-  init = tf.initialize_all_variables()
+  init = tf.global_variables_initializer()

 # Step 5: Begin training.
 num_steps = 100001
--- a/tensorflow/go/genop/generate.sh
+++ b/tensorflow/go/genop/generate.sh
@ -20,11 +20,17 @@ go get github.com/golang/protobuf/proto
 go get github.com/golang/protobuf/protoc-gen-go

 cd $(dirname $0)
-TF_DIR=${GOPATH}/src/github.com/tensorflow/tensorflow
+for g in $(echo $GOPATH | sed "s/:/ /g"); do
+    TF_DIR="${g}/src/github.com/tensorflow/tensorflow"
    PROTOC="${TF_DIR}/bazel-out/host/bin/external/protobuf/protoc"
+    if [ -x "${PROTOC}" ]; then
+        break
+    fi
+done

 if [ ! -x "${PROTOC}" ]
 then
+  set +e
  PATH_PROTOC=$(which protoc)
  if [ ! -x "${PATH_PROTOC}" ]
  then
@ -34,6 +40,7 @@ then
    exit 1
  fi
  PROTOC=$PATH_PROTOC
+  set -e
 fi

 # Ensure that protoc-gen-go is available in $PATH
--- a/tensorflow/java/README.md
+++ b/tensorflow/java/README.md
@ -110,7 +110,7 @@ libraries will need to be built from source.
    brew install swig
    ```

-3.  [Configure](https://www.tensorflow.org/get_started/os_setup#configure_the_installation)
+3.  [Configure](https://www.tensorflow.org/install/install_sources#configure_the_installation)
    (e.g., enable GPU support) and build:

    ```sh
@ -120,8 +120,8 @@ libraries will need to be built from source.
      //tensorflow/java:libtensorflow_jni
    ```

-The JAR (`libtensorflow.jar`) and native library (`libtensorflow_jni.so`) will 
-be in `bazel-bin/tensorflow/java`.
+The JAR (`libtensorflow.jar`) and native library (`libtensorflow_jni.so` on Linux or `libtensorflow_jni.dylib` on OS X) will 
+be in `bazel-bin/tensorflow/java`. Using these artifacts, follow steps 3 and 4 in the [quickstart](#quickstart) section to get your application up and running.

 ### Maven

--- a/tensorflow/java/src/main/java/org/tensorflow/SavedModelBundle.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/SavedModelBundle.java
@ -27,7 +27,8 @@ package org.tensorflow;
 public class SavedModelBundle implements AutoCloseable {

  /**
-   * Load a saved model from an export directory.
+   * Load a saved model from an export directory. The model that is being loaded should be created using
+   * the <a href="https://www.tensorflow.org/api_docs/python/tf/saved_model">Saved Model API</a>.
   *
   * @param exportDir the directory path containing a saved model.
   * @param tags the tags identifying the specific metagraphdef to load.
--- a/tensorflow/java/src/main/java/org/tensorflow/Tensor.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/Tensor.java
@ -172,8 +172,7 @@ public final class Tensor implements AutoCloseable {
   *
   * <p>Creates a Tensor with the provided shape of any type where the tensor's data has been
   * encoded into {@code data} as per the specification of the TensorFlow <a
-   * href="https://www.tensorflow.org/code/tensorflow/c/c_api.h">C
-   * API</a>.
+   * href="https://www.tensorflow.org/code/tensorflow/c/c_api.h">C API</a>.
   *
   * @param dataType the tensor datatype.
   * @param shape the tensor shape.
--- a/tensorflow/java/src/main/java/org/tensorflow/package-info.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/package-info.java
@ -19,8 +19,8 @@ limitations under the License.
 * <p><b>WARNING</b>: The API is currently experimental and is not covered by TensorFlow <a
 * href="https://www.tensorflow.org/programmers_guide/version_semantics">API stability
 * guarantees</a>. See <a
- * href="https://www.tensorflow.org/code/tensorflow/java/README.md">README.md</a>
- * for installation instructions.
+ * href="https://www.tensorflow.org/code/tensorflow/java/README.md">README.md</a> for installation
+ * instructions.
 *
 * <p>The <a
 * href="https://www.tensorflow.org/code/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java">LabelImage</a>
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@ -711,14 +711,14 @@ class BaseSession(SessionInterface):
       # v is the numpy array [10, 20]
       # 'fetches' can be a list.
       v = session.run([a, b])
-       # v a Python list with 2 numpy arrays: the numpy array [10, 20] and the
+       # v is a Python list with 2 numpy arrays: the 1-D array [10, 20] and the
       # 1-D array [1.0, 2.0]
       # 'fetches' can be arbitrary lists, tuples, namedtuple, dicts:
       MyData = collections.namedtuple('MyData', ['a', 'b'])
       v = session.run({'k1': MyData(a, b), 'k2': [b, a]})
       # v is a dict with
-       # v['k1'] is a MyData namedtuple with 'a' the numpy array [10, 20] and
-       # 'b' the numpy array [1.0, 2.0]
+       # v['k1'] is a MyData namedtuple with 'a' (the numpy array [10, 20]) and
+       # 'b' (the numpy array [1.0, 2.0])
       # v['k2'] is a list with the numpy array [1.0, 2.0] and the numpy array
       # [10, 20].
    ```
--- a/tensorflow/python/debug/BUILD
+++ b/tensorflow/python/debug/BUILD
@ -15,6 +15,7 @@ exports_files(["LICENSE"])

 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "if_not_windows")

 py_library(
    name = "debug_py",
@ -33,11 +34,12 @@ py_library(
 py_library(
    name = "debug_pip",
    deps = [
-        ":debug_examples",
        ":debug_py",
        ":offline_analyzer",
        ":session_debug_testlib",
-    ],
+    ] + if_not_windows([
+        ":debug_examples",
+    ]),
 )

 py_library(
--- a/tensorflow/python/kernel_tests/tensordot_op_test.py
+++ b/tensorflow/python/kernel_tests/tensordot_op_test.py
@ -84,9 +84,7 @@ class TensordotTest(test_lib.TestCase):
                                   b_ph: b,
                                   axes_ph: axes_value})

-  def test_no_partial_shape_inference(self):
-    # If one of the shapes is only partially defined, the output shape is
-    # unknown.
+  def test_partial_shape_inference(self):
    a = array_ops.placeholder(dtypes.float32)
    b = array_ops.placeholder(dtypes.float32)
    axes = ([1], [0])
@ -95,13 +93,21 @@ class TensordotTest(test_lib.TestCase):
    a.set_shape([None, 2])
    b.set_shape([2, 3])
    output = math_ops.tensordot(a, b, axes)
-    self.assertEqual(output.get_shape().ndims, None)
+    output_shape = output.get_shape()
+    self.assertEqual(output_shape.ndims, 2)
+    output_shape = output_shape.as_list()
+    self.assertEqual(output_shape[0], None)
+    self.assertEqual(output_shape[1], 3)
    a = array_ops.placeholder(dtypes.float32)
    b = array_ops.placeholder(dtypes.float32)
    a.set_shape([2, 2])
    b.set_shape([2, None])
    output = math_ops.tensordot(a, b, axes)
-    self.assertEqual(output.get_shape().ndims, None)
+    output_shape = output.get_shape()
+    self.assertEqual(output_shape.ndims, 2)
+    output_shape = output_shape.as_list()
+    self.assertEqual(output_shape[0], 2)
+    self.assertEqual(output_shape[1], None)


 def _get_tensordot_tests(dtype_, rank_a_, rank_b_, num_dims_, dynamic_shape_):
--- a/Show More
+++ b/Show More