diff --git a/RELEASE.md b/RELEASE.md
index 212e296271e..c0ca06e675f 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -17,6 +17,9 @@
   instead of graph.proto.
 * ops.NoGradient was renamed ops.NotDifferentiable. ops.NoGradient will
   be removed soon.
+* dot.h / DotGraph was removed (it was an early analysis tool prior
+  to TensorBoard, no longer that useful).  It remains in history
+  should someone find the code useful.
 
 # Release 0.10.0
 
diff --git a/avro.BUILD b/avro.BUILD
index 5e73c1a6783..f6d24afff5d 100644
--- a/avro.BUILD
+++ b/avro.BUILD
@@ -2,21 +2,19 @@ package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])  # Apache 2.0
 
-prefix_dir = "avro-cpp-1.8.0"
-
 cc_library(
     name = "avrocpp",
     srcs = glob(
         [
-            prefix_dir + "/impl/**/*.cc",
-            prefix_dir + "/impl/**/*.hh",
+            "impl/**/*.cc",
+            "impl/**/*.hh",
         ],
         exclude = [
-            prefix_dir + "/impl/avrogencpp.cc",
+            "impl/avrogencpp.cc",
         ],
     ),
-    hdrs = glob([prefix_dir + "/api/**/*.hh"]),
-    includes = [prefix_dir + "/api"],
+    hdrs = glob(["api/**/*.hh"]),
+    includes = ["api"],
     deps = [
         "@boost_archive//:boost",
         "@boost_archive//:filesystem",
@@ -27,7 +25,7 @@ cc_library(
 
 cc_binary(
     name = "avrogencpp",
-    srcs = [prefix_dir + "/impl/avrogencpp.cc"],
+    srcs = ["impl/avrogencpp.cc"],
     deps = [
         ":avrocpp",
         "@boost_archive//:program_options",
diff --git a/boost.BUILD b/boost.BUILD
index da47abdb6c0..c10d9eba476 100644
--- a/boost.BUILD
+++ b/boost.BUILD
@@ -10,21 +10,19 @@ package(default_visibility = ["@avro_archive//:__subpackages__"])
 
 licenses(["notice"])  # Boost software license
 
-prefix_dir = "boost_1_61_0"
-
 cc_library(
     name = "boost",
     hdrs = glob([
-        prefix_dir + "/boost/**/*.hpp",
-        prefix_dir + "/boost/**/*.h",
-        prefix_dir + "/boost/**/*.ipp",
+        "boost/**/*.hpp",
+        "boost/**/*.h",
+        "boost/**/*.ipp",
     ]),
-    includes = [prefix_dir],
+    includes = ["."],
 )
 
 cc_library(
     name = "filesystem",
-    srcs = glob([prefix_dir + "/libs/filesystem/src/*.cpp"]),
+    srcs = glob(["libs/filesystem/src/*.cpp"]),
     deps = [
         ":boost",
         ":system",
@@ -33,7 +31,7 @@ cc_library(
 
 cc_library(
     name = "iostreams",
-    srcs = glob([prefix_dir + "/libs/iostreams/src/*.cpp"]),
+    srcs = glob(["libs/iostreams/src/*.cpp"]),
     deps = [
         ":boost",
         "@bzip2_archive//:bz2lib",
@@ -43,16 +41,12 @@ cc_library(
 
 cc_library(
     name = "program_options",
-    srcs = glob([prefix_dir + "/libs/program_options/src/*.cpp"]),
-    deps = [
-        ":boost",
-    ],
+    srcs = glob(["libs/program_options/src/*.cpp"]),
+    deps = [":boost"],
 )
 
 cc_library(
     name = "system",
-    srcs = glob([prefix_dir + "/libs/system/src/*.cpp"]),
-    deps = [
-        ":boost",
-    ],
+    srcs = glob(["libs/system/src/*.cpp"]),
+    deps = [":boost"],
 )
diff --git a/bzip2.BUILD b/bzip2.BUILD
index 42e16df6716..8865054d70c 100644
--- a/bzip2.BUILD
+++ b/bzip2.BUILD
@@ -2,35 +2,27 @@ package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])  # BSD derivative
 
-prefix_dir = "bzip2-1.0.6"
-
-BZ2LIB_SRCS = [
-    # these are in the same order as their corresponding .o files are in OBJS in
-    # Makefile (rather than lexicographic order) for easy comparison (that they
-    # are identical).
-    "blocksort.c",
-    "huffman.c",
-    "crctable.c",
-    "randtable.c",
-    "compress.c",
-    "decompress.c",
-    "bzlib.c",
-]
-
 cc_library(
     name = "bz2lib",
-    srcs = [prefix_dir + "/" + source for source in BZ2LIB_SRCS] +
-        [prefix_dir + "/bzlib_private.h"],
-    hdrs = [prefix_dir + "/bzlib.h"],
-    includes = [prefix_dir],
+    srcs = [
+        # These are in the same order as their corresponding .o files are in
+        # OBJS in Makefile (rather than lexicographic order) for easy
+        # comparison (that they are identical).
+        "blocksort.c",
+        "huffman.c",
+        "crctable.c",
+        "randtable.c",
+        "compress.c",
+        "decompress.c",
+        "bzlib.c",
+        "bzlib_private.h",
+    ],
+    hdrs = ["bzlib.h"],
+    includes = ["."],
 )
 
 cc_binary(
     name = "bzip2",
-    srcs = [
-        "bzip2.c",
-    ],
-    deps = [
-        ":bz2lib",
-    ],
+    srcs = ["bzip2.c"],
+    deps = [":bz2lib"],
 )
diff --git a/eigen.BUILD b/eigen.BUILD
index 8e964130169..8a699f6aa84 100644
--- a/eigen.BUILD
+++ b/eigen.BUILD
@@ -1,8 +1,70 @@
-package(default_visibility = ["//visibility:public"])
+# Description:
+#   Eigen is a C++ template library for linear algebra: vectors,
+#   matrices, and related algorithms.
+
+licenses([
+    # Note: Eigen is an MPL2 library that includes GPL v3 and LGPL v2.1+ code.
+    #       We've taken special care to not reference any restricted code.
+    "reciprocal",  # MPL2
+    "notice",  # Portions BSD
+])
+
+# License-restricted (i.e. not reciprocal or notice) files inside Eigen/...
+EIGEN_RESTRICTED_FILES = [
+    "Eigen/src/OrderingMethods/Amd.h",
+    "Eigen/src/SparseCholesky/**",
+]
+
+# Notable transitive dependencies of restricted files inside Eigen/...
+EIGEN_RESTRICTED_DEPS = [
+    "Eigen/Eigen",
+    "Eigen/IterativeLinearSolvers",
+    "Eigen/MetisSupport",
+    "Eigen/Sparse",
+    "Eigen/SparseCholesky",
+    "Eigen/SparseLU",
+]
+
+# Note: unsupported/Eigen is unsupported and might go away at any time.
+EIGEN_FILES = [
+    "Eigen/**",
+    "unsupported/Eigen/CXX11/**",
+    "unsupported/Eigen/FFT",
+    "unsupported/Eigen/KroneckerProduct",
+    "unsupported/Eigen/src/FFT/**",
+    "unsupported/Eigen/src/KroneckerProduct/**",
+    "unsupported/Eigen/MatrixFunctions",
+    "unsupported/Eigen/SpecialFunctions",
+    "unsupported/Eigen/src/SpecialFunctions/**",
+]
+
+# List of files picked up by glob but actually part of another target.
+EIGEN_EXCLUDE_FILES = [
+    "Eigen/src/Core/arch/AVX/PacketMathGoogleTest.cc",
+]
+
+# Files known to be under MPL2 license.
+EIGEN_MPL2_HEADER_FILES = glob(
+    EIGEN_FILES,
+    exclude = EIGEN_EXCLUDE_FILES +
+              EIGEN_RESTRICTED_FILES +
+              EIGEN_RESTRICTED_DEPS + [
+        # Guarantees any file missed by excludes above will not compile.
+        "Eigen/src/Core/util/NonMPL2.h",
+        "Eigen/**/CMakeLists.txt",
+    ],
+)
 
 cc_library(
     name = "eigen",
-    hdrs = glob(["**/*.h", "unsupported/Eigen/*", "unsupported/Eigen/CXX11/*", "Eigen/*"]),
-    includes = [ '.' ],
+    hdrs = EIGEN_MPL2_HEADER_FILES,
+    defines = [
+        # This define (mostly) guarantees we don't link any problematic
+        # code. We use it, but we do not rely on it, as evidenced above.
+        "EIGEN_MPL2_ONLY",
+        # TODO(jart): Use EIGEN_USE_NONBLOCKING_THREAD_POOL but first add an
+        #             eigen_initialize.cc file and alwayslink=1.
+    ],
+    includes = ["."],
     visibility = ["//visibility:public"],
 )
diff --git a/farmhash.BUILD b/farmhash.BUILD
index fe19e144303..8111cd61f9f 100644
--- a/farmhash.BUILD
+++ b/farmhash.BUILD
@@ -1,21 +1,9 @@
-package(default_visibility = ["//visibility:public"])
-
-prefix_dir = "farmhash-34c13ddfab0e35422f4c3979f360635a8c050260"
-
-genrule(
-    name = "configure",
-    srcs = glob(
-        ["**/*"],
-        exclude = [prefix_dir + "/config.h"],
-    ),
-    outs = [prefix_dir + "/config.h"],
-    cmd = "pushd external/farmhash_archive/%s; workdir=$$(mktemp -d -t tmp.XXXXXXXXXX); cp -a * $$workdir; pushd $$workdir; ./configure; popd; popd; cp $$workdir/config.h $(@D); rm -rf $$workdir;" % prefix_dir,
-)
+licenses(["notice"])  # MIT
 
 cc_library(
     name = "farmhash",
-    srcs = [prefix_dir + "/src/farmhash.cc"],
-    hdrs = [prefix_dir + "/src/farmhash.h"] + [":configure"],
-    includes = [prefix_dir],
-    visibility = ["//visibility:public"]
+    srcs = ["farmhash.cc"],
+    hdrs = ["farmhash.h"],
+    includes = ["."],
+    visibility = ["//visibility:public"],
 )
diff --git a/gif.BUILD b/gif.BUILD
index 892e109e7dd..22ccda52e42 100644
--- a/gif.BUILD
+++ b/gif.BUILD
@@ -1,65 +1,44 @@
-SOURCES = [
-    "dgif_lib.c",
-    "egif_lib.c",
-    "gif_font.c",
-    "gif_hash.c",
-    "gifalloc.c",
-    "openbsd-reallocarray.c",
-    "gif_err.c",
-    "quantize.c",
-]
+# Description:
+#   A library for decoding and encoding GIF images
 
-HEADERS = [
-    "gif_hash.h",
-    "gif_lib.h",
-    "gif_lib_private.h",
-]
-
-config_setting(
-        name = "windows",
-        values = {
-            "cpu": "x64_windows_msvc",
-        },
-        visibility = ["//visibility:public"],
-)
-
-prefix_dir = "giflib-5.1.4/lib"
-prefix_dir_windows = "windows/giflib-5.1.4/lib"
-
-genrule(
-  name = "srcs_without_unistd",
-  srcs = [prefix_dir + "/" + source for source in SOURCES],
-  outs = [prefix_dir_windows + "/" + source for source in SOURCES],
-  cmd = "for f in $(SRCS); do " +
-        "  sed 's/#include <unistd.h>//g' $$f > $(@D)/%s/$$(basename $$f);" % prefix_dir_windows +
-        "done",
-)
-
-genrule(
-  name = "hdrs_without_unistd",
-  srcs = [prefix_dir + "/" + hdrs for hdrs in HEADERS],
-  outs = [prefix_dir_windows + "/" + hdrs for hdrs in HEADERS],
-  cmd = "for f in $(SRCS); do " +
-        "  sed 's/#include <unistd.h>//g' $$f > $(@D)/%s/$$(basename $$f);" % prefix_dir_windows +
-        "done",
-)
+licenses(["notice"])  # MIT
 
 cc_library(
     name = "gif",
-    srcs = select({
-        "//conditions:default" : [prefix_dir + "/" + source for source in SOURCES],
-        ":windows" : [":srcs_without_unistd"],
-    }),
-    hdrs = select({
-        "//conditions:default" : [prefix_dir + "/" + hdrs for hdrs in HEADERS],
-        ":windows" : [":hdrs_without_unistd"],
-    }),
-    includes = select({
-        "//conditions:default" : [prefix_dir],
-        ":windows" : [prefix_dir_windows],
-    }),
-    defines = [
-        "HAVE_CONFIG_H",
+    srcs = [
+        "dgif_lib.c",
+        "egif_lib.c",
+        "gif_err.c",
+        "gif_font.c",
+        "gif_hash.c",
+        "gif_hash.h",
+        "gif_lib_private.h",
+        "gifalloc.c",
+        "openbsd-reallocarray.c",
+        "quantize.c",
     ],
+    hdrs = ["gif_lib.h"],
+    includes = ["."],
     visibility = ["//visibility:public"],
+    deps = select({
+        ":windows": [":windows_polyfill"],
+        "//conditions:default": [],
+    }),
+)
+
+cc_library(
+    name = "windows_polyfill",
+    hdrs = ["windows/unistd.h"],
+    includes = ["windows"],
+)
+
+genrule(
+    name = "windows_unistd_h",
+    outs = ["windows/unistd.h"],
+    cmd = "touch $@",
+)
+
+config_setting(
+    name = "windows",
+    values = {"cpu": "x64_windows_msvc"},
 )
diff --git a/gmock.BUILD b/gmock.BUILD
index 82abf275408..66ed60750d2 100644
--- a/gmock.BUILD
+++ b/gmock.BUILD
@@ -1,19 +1,25 @@
+# Description:
+#   Google C++ Mocking Framework, a library for creating and using C++
+#   mock classes.
+
+licenses(["notice"])  # 3-clause BSD
+
 cc_library(
     name = "gtest",
     srcs = [
-        "gmock-1.7.0/gtest/src/gtest-all.cc",
-        "gmock-1.7.0/src/gmock-all.cc",
+        "gtest/src/gtest-all.cc",
+        "src/gmock-all.cc",
     ],
     hdrs = glob([
-        "gmock-1.7.0/**/*.h",
-        "gmock-1.7.0/gtest/src/*.cc",
-        "gmock-1.7.0/src/*.cc",
+        "**/*.h",
+        "gtest/src/*.cc",
+        "src/*.cc",
     ]),
     includes = [
-        "gmock-1.7.0",
-        "gmock-1.7.0/gtest",
-        "gmock-1.7.0/gtest/include",
-        "gmock-1.7.0/include",
+        ".",
+        "gtest",
+        "gtest/include",
+        "include",
     ],
     linkopts = ["-pthread"],
     visibility = ["//visibility:public"],
@@ -21,7 +27,7 @@ cc_library(
 
 cc_library(
     name = "gtest_main",
-    srcs = ["gmock-1.7.0/src/gmock_main.cc"],
+    srcs = ["src/gmock_main.cc"],
     linkopts = ["-pthread"],
     visibility = ["//visibility:public"],
     deps = [":gtest"],
diff --git a/grpc.BUILD b/grpc.BUILD
index c1404b54fb1..14e8daff5e9 100644
--- a/grpc.BUILD
+++ b/grpc.BUILD
@@ -3,6 +3,7 @@
 # ...with small modifications to fix the build rules for :grpc++_unsecure.
 #
 # TODO(mrry): Upstream these fixes back to the gRPC repository.
+# TODO(jart): Fix nanopb's BUILD file. Fix grpc BUILD file.
 
 # GRPC Bazel BUILD file.
 # This currently builds C, C++ and Objective-C code.
@@ -44,9 +45,26 @@ licenses(["notice"])  # 3-clause BSD
 
 package(default_visibility = ["//visibility:public"])
 
+genrule(
+    name = "pb_h",
+    outs = ["third_party/nanopb/pb.h"],
+    cmd = "echo '#include <pb.h>' >$@",
+    visibility = ["//visibility:private"],
+)
 
+genrule(
+    name = "pb_decode_h",
+    outs = ["third_party/nanopb/pb_decode.h"],
+    cmd = "echo '#include <pb_decode.h>' >$@",
+    visibility = ["//visibility:private"],
+)
 
-
+genrule(
+    name = "pb_encode_h",
+    outs = ["third_party/nanopb/pb_encode.h"],
+    cmd = "echo '#include <pb_encode.h>' >$@",
+    visibility = ["//visibility:private"],
+)
 
 cc_library(
   name = "gpr",
@@ -499,6 +517,9 @@ cc_library(
     "src/core/ext/census/placeholders.c",
     "src/core/ext/census/tracing.c",
     "src/core/plugin_registry/grpc_plugin_registry.c",
+    "third_party/nanopb/pb.h",
+    "third_party/nanopb/pb_decode.h",
+    "third_party/nanopb/pb_encode.h",
   ],
   hdrs = [
     "include/grpc/byte_buffer.h",
@@ -856,6 +877,9 @@ cc_library(
     "src/core/lib/tsi/ssl_transport_security.c",
     "src/core/lib/tsi/transport_security.c",
     "src/core/plugin_registry/grpc_cronet_plugin_registry.c",
+    "third_party/nanopb/pb.h",
+    "third_party/nanopb/pb_decode.h",
+    "third_party/nanopb/pb_encode.h",
   ],
   hdrs = [
     "include/grpc/byte_buffer.h",
@@ -1185,6 +1209,9 @@ cc_library(
     "src/core/ext/census/placeholders.c",
     "src/core/ext/census/tracing.c",
     "src/core/plugin_registry/grpc_unsecure_plugin_registry.c",
+    "third_party/nanopb/pb.h",
+    "third_party/nanopb/pb_decode.h",
+    "third_party/nanopb/pb_encode.h",
   ],
   hdrs = [
     "include/grpc/byte_buffer.h",
@@ -2313,6 +2340,9 @@ objc_library(
     "src/core/ext/census/grpc_filter.h",
     "src/core/ext/census/mlog.h",
     "src/core/ext/census/rpc_metric_id.h",
+    "third_party/nanopb/pb.h",
+    "third_party/nanopb/pb_decode.h",
+    "third_party/nanopb/pb_encode.h",
   ],
   includes = [
     "include",
diff --git a/jpeg.BUILD b/jpeg.BUILD
index ad9e44363c5..92c9ddcacf8 100644
--- a/jpeg.BUILD
+++ b/jpeg.BUILD
@@ -1,83 +1,89 @@
-SOURCES = [
-    "jaricom.c",
-    "jcapimin.c",
-    "jcapistd.c",
-    "jcarith.c",
-    "jccoefct.c",
-    "jccolor.c",
-    "jcdctmgr.c",
-    "jchuff.c",
-    "jcinit.c",
-    "jcmainct.c",
-    "jcmarker.c",
-    "jcmaster.c",
-    "jcomapi.c",
-    "jcparam.c",
-    "jcprepct.c",
-    "jcsample.c",
-    "jctrans.c",
-    "jdarith.c",
-    "jdapimin.c",
-    "jdapistd.c",
-    "jdatadst.c",
-    "jdatasrc.c",
-    "jdcoefct.c",
-    "jdcolor.c",
-    "jddctmgr.c",
-    "jdhuff.c",
-    "jdinput.c",
-    "jdmainct.c",
-    "jdmarker.c",
-    "jdmaster.c",
-    "jdmerge.c",
-    "jdpostct.c",
-    "jdsample.c",
-    "jdtrans.c",
-    "jerror.c",
-    "jfdctflt.c",
-    "jfdctfst.c",
-    "jfdctint.c",
-    "jidctflt.c",
-    "jidctfst.c",
-    "jidctint.c",
-    "jmemmgr.c",
-    "jmemnobs.c",
-    "jquant1.c",
-    "jquant2.c",
-    "jutils.c",
-]
+# Description:
+#   The Independent JPEG Group's JPEG runtime library.
 
-HEADERS = [
-    "cderror.h",
-    "cdjpeg.h",
-    "jconfig.h",
-    "jdct.h",
-    "jerror.h",
-    "jinclude.h",
-    "jmemsys.h",
-    "jmorecfg.h",
-    "jpegint.h",
-    "jpeglib.h",
-    "jversion.h",
-    "transupp.h",
-]
-
-prefix_dir = "jpeg-9a"
-
-genrule(
-    name = "configure",
-    srcs = glob(
-        ["**/*"],
-        exclude = [prefix_dir + "/jconfig.h"],
-    ),
-    outs = [prefix_dir + "/jconfig.h"],
-    cmd = "pushd external/jpeg_archive/%s; workdir=$$(mktemp -d -t tmp.XXXXXXXXXX); cp -a * $$workdir; pushd $$workdir; ./configure; popd; popd; cp $$workdir/jconfig.h $(@D); rm -rf $$workdir;" % prefix_dir,
-)
+licenses(["notice"])  # custom notice-style license, see LICENSE
 
 cc_library(
     name = "jpeg",
-    srcs = [prefix_dir + "/" + source for source in SOURCES],
-    hdrs = glob(["**/*.h"]) + [":configure"],
-    includes = [prefix_dir],
+    srcs = [
+        "cderror.h",
+        "cdjpeg.h",
+        "jaricom.c",
+        "jcapimin.c",
+        "jcapistd.c",
+        "jcarith.c",
+        "jccoefct.c",
+        "jccolor.c",
+        "jcdctmgr.c",
+        "jchuff.c",
+        "jcinit.c",
+        "jcmainct.c",
+        "jcmarker.c",
+        "jcmaster.c",
+        "jcomapi.c",
+        "jconfig.h",
+        "jcparam.c",
+        "jcprepct.c",
+        "jcsample.c",
+        "jctrans.c",
+        "jdapimin.c",
+        "jdapistd.c",
+        "jdarith.c",
+        "jdatadst.c",
+        "jdatasrc.c",
+        "jdcoefct.c",
+        "jdcolor.c",
+        "jdct.h",
+        "jddctmgr.c",
+        "jdhuff.c",
+        "jdinput.c",
+        "jdmainct.c",
+        "jdmarker.c",
+        "jdmaster.c",
+        "jdmerge.c",
+        "jdpostct.c",
+        "jdsample.c",
+        "jdtrans.c",
+        "jerror.c",
+        "jfdctflt.c",
+        "jfdctfst.c",
+        "jfdctint.c",
+        "jidctflt.c",
+        "jidctfst.c",
+        "jidctint.c",
+        "jinclude.h",
+        "jmemmgr.c",
+        "jmemnobs.c",
+        "jmemsys.h",
+        "jmorecfg.h",
+        "jquant1.c",
+        "jquant2.c",
+        "jutils.c",
+        "jversion.h",
+        "transupp.h",
+    ],
+    hdrs = [
+        "jerror.h",
+        "jpegint.h",
+        "jpeglib.h",
+    ],
+    includes = ["."],
     visibility = ["//visibility:public"],
 )
+
+genrule(
+    name = "configure",
+    outs = ["jconfig.h"],
+    cmd = "cat <<EOF >$@\n" +
+          "#define HAVE_PROTOTYPES 1\n" +
+          "#define HAVE_UNSIGNED_CHAR 1\n" +
+          "#define HAVE_UNSIGNED_SHORT 1\n" +
+          "#define HAVE_STDDEF_H 1\n" +
+          "#define HAVE_STDLIB_H 1\n" +
+          "#ifdef WIN32\n" +
+          "#define INLINE __inline\n" +
+          "#else\n" +
+          "#define INLINE __inline__\n" +
+          "#endif\n" +
+          "EOF\n",
+)
diff --git a/jsoncpp.BUILD b/jsoncpp.BUILD
index 2bb2e19a67f..765bf15129a 100644
--- a/jsoncpp.BUILD
+++ b/jsoncpp.BUILD
@@ -1,34 +1,31 @@
-licenses(["notice"])  # MIT
-
-JSON_HEADERS = [
-    "include/json/assertions.h",
-    "include/json/autolink.h",
-    "include/json/config.h",
-    "include/json/features.h",
-    "include/json/forwards.h",
-    "include/json/json.h",
-    "src/lib_json/json_batchallocator.h",
-    "include/json/reader.h",
-    "include/json/value.h",
-    "include/json/writer.h",
-]
-
-JSON_SOURCES = [
-    "src/lib_json/json_reader.cpp",
-    "src/lib_json/json_value.cpp",
-    "src/lib_json/json_writer.cpp",
-    "src/lib_json/json_tool.h",
-]
-
-INLINE_SOURCES = [
-    "src/lib_json/json_valueiterator.inl",
-]
+licenses(["unencumbered"])  # Public Domain or MIT
 
 cc_library(
     name = "jsoncpp",
-    srcs = JSON_SOURCES,
-    hdrs = JSON_HEADERS,
+    srcs = [
+        "include/json/assertions.h",
+        "src/lib_json/json_batchallocator.h",
+        "src/lib_json/json_reader.cpp",
+        "src/lib_json/json_tool.h",
+        "src/lib_json/json_value.cpp",
+        "src/lib_json/json_writer.cpp",
+    ],
+    hdrs = [
+        "include/json/autolink.h",
+        "include/json/config.h",
+        "include/json/features.h",
+        "include/json/forwards.h",
+        "include/json/json.h",
+        "include/json/reader.h",
+        "include/json/value.h",
+        "include/json/writer.h",
+    ],
     includes = ["include"],
-    textual_hdrs = INLINE_SOURCES,
     visibility = ["//visibility:public"],
+    deps = [":private"],
+)
+
+cc_library(
+    name = "private",
+    textual_hdrs = ["src/lib_json/json_valueiterator.inl"],
 )
diff --git a/linenoise.BUILD b/linenoise.BUILD
new file mode 100644
index 00000000000..9924a620f24
--- /dev/null
+++ b/linenoise.BUILD
@@ -0,0 +1,13 @@
+licenses(["notice"])  # 2-clause BSD
+
+exports_files(["LICENSE"])
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "linenoise",
+    srcs = ["linenoise.c"],
+    hdrs = ["linenoise.h"],
+)
diff --git a/nanopb.BUILD b/nanopb.BUILD
index fedaf1bd471..8b428689e1e 100644
--- a/nanopb.BUILD
+++ b/nanopb.BUILD
@@ -1,19 +1,21 @@
-SOURCES = [
-    "pb_common.c",
-    "pb_decode.c",
-    "pb_encode.c",
-]
+# Description:
+#   Nanopb, a tiny ANSI C protobuf implementation for use on embedded devices.
 
-HEADERS = [
-    "pb.h",
-    "pb_common.h",
-    "pb_decode.h",
-    "pb_encode.h",
-]
+licenses(["notice"])  # zlib license
 
 cc_library(
     name = "nanopb",
-    srcs = SOURCES,
-    hdrs = HEADERS,
+    srcs = [
+        "pb_common.c",
+        "pb_decode.c",
+        "pb_encode.c",
+    ],
+    hdrs = [
+        "pb.h",
+        "pb_common.h",
+        "pb_decode.h",
+        "pb_encode.h",
+    ],
+    includes = ["."],
     visibility = ["//visibility:public"],
 )
diff --git a/png.BUILD b/png.BUILD
index 1ecf1504d99..9ff982bc902 100644
--- a/png.BUILD
+++ b/png.BUILD
@@ -1,40 +1,33 @@
-package(default_visibility = ["//visibility:public"])
+# Description:
+#   libpng is the official PNG reference library.
 
-prefix_dir = "libpng-1.2.53"
-
-PNG_SOURCES = [
-    "png.c",
-    "pngerror.c",
-    "pngget.c",
-    "pngmem.c",
-    "pngpread.c",
-    "pngread.c",
-    "pngrio.c",
-    "pngrtran.c",
-    "pngrutil.c",
-    "pngset.c",
-    "pngtrans.c",
-    "pngwio.c",
-    "pngwrite.c",
-    "pngwtran.c",
-    "pngwutil.c",
-]
-
-genrule(
-    name = "configure",
-    srcs = glob(
-        ["**/*"],
-        exclude = [prefix_dir + "/config.h"],
-    ),
-    outs = [prefix_dir + "/config.h"],
-    cmd = "pushd external/png_archive/%s; workdir=$$(mktemp -d -t tmp.XXXXXXXXXX); cp -a * $$workdir; pushd $$workdir; ./configure --enable-shared=no --with-pic=no; popd; popd; cp $$workdir/config.h $(@D); rm -rf $$workdir;" % prefix_dir,
-)
+licenses(["notice"])  # BSD/MIT-like license
 
 cc_library(
     name = "png",
-    srcs = [prefix_dir + "/" + source for source in PNG_SOURCES],
-    hdrs = glob(["**/*.h"]) + [":configure"],
-    includes = [prefix_dir],
-    linkopts = ["-lz"],
+    srcs = [
+        "png.c",
+        "pngerror.c",
+        "pngget.c",
+        "pngmem.c",
+        "pngpread.c",
+        "pngread.c",
+        "pngrio.c",
+        "pngrtran.c",
+        "pngrutil.c",
+        "pngset.c",
+        "pngtrans.c",
+        "pngwio.c",
+        "pngwrite.c",
+        "pngwtran.c",
+        "pngwutil.c",
+    ],
+    hdrs = [
+        "png.h",
+        "pngconf.h",
+    ],
+    includes = ["."],
+    linkopts = ["-lm"],
     visibility = ["//visibility:public"],
+    deps = ["@zlib_archive//:zlib"],
 )
diff --git a/six.BUILD b/six.BUILD
index 5047a452e41..fd3d0cc16f4 100644
--- a/six.BUILD
+++ b/six.BUILD
@@ -1,13 +1,12 @@
-genrule(
-    name = "copy_six",
-    srcs = ["six-1.10.0/six.py"],
-    outs = ["six.py"],
-    cmd = "cp $< $(@)",
-)
+# Description:
+#   Six provides simple utilities for wrapping over differences between Python 2
+#   and Python 3.
+
+licenses(["notice"])  # MIT
 
 py_library(
     name = "six",
     srcs = ["six.py"],
-    visibility = ["//visibility:public"],
     srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
 )
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 8e485e66a93..0aa76448e9e 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -93,10 +93,12 @@ filegroup(
         ":all_files",
         "//tensorflow/c:all_files",
         "//tensorflow/cc:all_files",
+        "//tensorflow/cc/saved_model:all_files",
         "//tensorflow/contrib:all_files",
         "//tensorflow/contrib/android:all_files",
         "//tensorflow/contrib/bayesflow:all_files",
         "//tensorflow/contrib/copy_graph:all_files",
+        "//tensorflow/contrib/crf:all_files",
         "//tensorflow/contrib/cudnn_rnn:all_files",
         "//tensorflow/contrib/distributions:all_files",
         "//tensorflow/contrib/factorization:all_files",
@@ -129,7 +131,11 @@ filegroup(
         "//tensorflow/contrib/slim/python/slim/nets:all_files",
         "//tensorflow/contrib/tensor_forest:all_files",
         "//tensorflow/contrib/tensor_forest/hybrid:all_files",
+        "//tensorflow/contrib/tensorboard:all_files",
         "//tensorflow/contrib/testing:all_files",
+        "//tensorflow/contrib/tfprof/python/tools/tfprof:all_files",
+        "//tensorflow/contrib/tfprof/tools/tfprof:all_files",
+        "//tensorflow/contrib/tfprof/tools/tfprof/internal:all_files",
         "//tensorflow/contrib/training:all_files",
         "//tensorflow/contrib/util:all_files",
         "//tensorflow/core:all_files",
@@ -142,6 +148,7 @@ filegroup(
         "//tensorflow/core/platform/default/build_config:all_files",
         "//tensorflow/core/platform/hadoop:all_files",
         "//tensorflow/core/util/ctc:all_files",
+        "//tensorflow/core/util/tensor_bundle:all_files",
         "//tensorflow/examples/android:all_files",
         "//tensorflow/examples/how_tos/reading_data:all_files",
         "//tensorflow/examples/image_retraining:all_files",
@@ -166,6 +173,7 @@ filegroup(
         "//tensorflow/python/debug:all_files",
         "//tensorflow/python/kernel_tests:all_files",
         "//tensorflow/python/saved_model:all_files",
+        "//tensorflow/python/saved_model/example:all_files",
         "//tensorflow/python/tools:all_files",
         "//tensorflow/tensorboard:all_files",
         "//tensorflow/tensorboard/app:all_files",
@@ -176,7 +184,6 @@ filegroup(
         "//tensorflow/tensorboard/lib:all_files",
         "//tensorflow/tensorboard/lib/python:all_files",
         "//tensorflow/tensorboard/scripts:all_files",
-        "//tensorflow/third_party/hadoop:all_files",
         "//tensorflow/tools/dist_test/server:all_files",
         "//tensorflow/tools/docker:all_files",
         "//tensorflow/tools/docker/notebooks:all_files",
@@ -185,6 +192,7 @@ filegroup(
         "//tensorflow/tools/proto_text:all_files",
         "//tensorflow/tools/test:all_files",
         "//tensorflow/user_ops:all_files",
+        "//third_party/hadoop:all_files",
     ],
     visibility = [":__subpackages__"],
 )
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index a554501f172..f4023c5da2b 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/lib/core/coding.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -1574,6 +1575,40 @@ void TF_GraphToGraphDef(TF_Graph* graph, TF_Buffer* output_graph_def,
   status->status = MessageToBuffer(def, output_graph_def);
 }
 
+struct TF_ImportGraphDefOptions {
+  tensorflow::ImportGraphDefOptions opts;
+};
+
+TF_ImportGraphDefOptions* TF_NewImportGraphDefOptions() {
+  return new TF_ImportGraphDefOptions;
+}
+void TF_DeleteImportGraphDefOptions(TF_ImportGraphDefOptions* opts) {
+  delete opts;
+}
+void TF_ImportGraphDefOptionsSetPrefix(TF_ImportGraphDefOptions* opts,
+                                       const char* prefix) {
+  opts->opts.prefix = prefix;
+}
+
+void TF_GraphImportGraphDef(TF_Graph* graph, const TF_Buffer* graph_def,
+                            const TF_ImportGraphDefOptions* opts,
+                            TF_Status* status) {
+  GraphDef def;
+  if (!def.ParseFromArray(graph_def->data, graph_def->length)) {
+    status->status = InvalidArgument("Invalid GraphDef");
+    return;
+  }
+  mutex_lock l(graph->mu);
+  const int last_node_id = graph->graph.num_node_ids();
+  status->status = tensorflow::ImportGraphDef(opts->opts, def, &graph->graph,
+                                              &graph->refiner);
+  if (!status->status.ok()) return;
+  for (int i = last_node_id; i < graph->graph.num_node_ids(); ++i) {
+    auto* node = graph->graph.FindNodeId(i);
+    if (node != nullptr) graph->name_map[node->name()] = node;
+  }
+}
+
 // TF_SessionWithGraph functions ----------------------------------------------
 
 TF_SessionWithGraph* TF_NewSessionWithGraph(TF_Graph* graph,
diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index 8efab40df45..4cfb753c5c5 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -739,20 +739,36 @@ extern TF_Operation* TF_GraphOperationByName(TF_Graph* graph,
 // }
 extern TF_Operation* TF_GraphNextOperation(TF_Graph* graph, size_t* pos);
 
-// Note: The following two functions may fail on very large protos in the
-// future.
-
+// Write out a serialized representation of `graph` (as a GraphDef protocol
+// message) to `output_graph_def`.
+//
+// May fail on very large graphs in the future.
 extern void TF_GraphToGraphDef(TF_Graph* graph, TF_Buffer* output_graph_def,
                                TF_Status* status);
 
+// TF_ImportGraphDefOptions holds options that can be passed to
+// TF_GraphImportGraphDef.
+typedef struct TF_ImportGraphDefOptions TF_ImportGraphDefOptions;
+
+extern TF_ImportGraphDefOptions* TF_NewImportGraphDefOptions();
+extern void TF_DeleteImportGraphDefOptions(TF_ImportGraphDefOptions* opts);
+
+// Set the prefix to be prepended to the names of nodes in `graph_def` that will
+// be imported into `graph`.
+extern void TF_ImportGraphDefOptionsSetPrefix(TF_ImportGraphDefOptions* opts,
+                                              const char* prefix);
+
+// Import the graph serialized in `graph_def` into `graph`.
+extern void TF_GraphImportGraphDef(TF_Graph* graph, const TF_Buffer* graph_def,
+                                   const TF_ImportGraphDefOptions* options,
+                                   TF_Status* status);
+
+// Note: The following function may fail on very large protos in the future.
+
 extern void TF_OperationToNodeDef(TF_Operation* oper,
                                   TF_Buffer* output_node_def,
                                   TF_Status* status);
 
-// TODO(cwhipkey): Query shape for operation outputs.
-
-// TODO(ashankar): Import GraphDef into TF_Graph.
-
 // TODO(andydavis): Function to add gradients to a graph.
 
 // TODO(josh11b): Register OpDef, available to all operations added
diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc
index 2a49cbd5abf..d85976d2d15 100644
--- a/tensorflow/c/c_api_test.cc
+++ b/tensorflow/c/c_api_test.cc
@@ -45,7 +45,7 @@ TF_Tensor* TF_Tensor_EncodeStrings(const Tensor& src);
 
 namespace {
 
-TEST(CApi, Status) {
+TEST(CAPI, Status) {
   TF_Status* s = TF_NewStatus();
   EXPECT_EQ(TF_OK, TF_GetCode(s));
   EXPECT_EQ(string(), TF_Message(s));
@@ -60,7 +60,7 @@ static void Deallocator(void* data, size_t, void* arg) {
   *reinterpret_cast<bool*>(arg) = true;
 }
 
-TEST(CApi, Tensor) {
+TEST(CAPI, Tensor) {
   const int num_bytes = 6 * sizeof(float);
   float* values =
       reinterpret_cast<float*>(tensorflow::cpu_allocator()->AllocateRaw(
@@ -80,7 +80,7 @@ TEST(CApi, Tensor) {
   EXPECT_TRUE(deallocator_called);
 }
 
-TEST(CApi, AllocateTensor) {
+TEST(CAPI, AllocateTensor) {
   const int num_bytes = 6 * sizeof(float);
   int64_t dims[] = {2, 3};
   TF_Tensor* t = TF_AllocateTensor(TF_FLOAT, dims, 2, num_bytes);
@@ -92,7 +92,7 @@ TEST(CApi, AllocateTensor) {
   TF_DeleteTensor(t);
 }
 
-TEST(CApi, LibraryLoadFunctions) {
+TEST(CAPI, LibraryLoadFunctions) {
   // Load the library.
   TF_Status* status = TF_NewStatus();
   TF_Library* lib =
@@ -139,7 +139,7 @@ static void TestEncodeDecode(int line, const std::vector<string>& data) {
   }
 }
 
-TEST(CApi, TensorEncodeDecodeStrings) {
+TEST(CAPI, TensorEncodeDecodeStrings) {
   TestEncodeDecode(__LINE__, {});
   TestEncodeDecode(__LINE__, {"hello"});
   TestEncodeDecode(__LINE__,
@@ -149,12 +149,12 @@ TEST(CApi, TensorEncodeDecodeStrings) {
   TestEncodeDecode(__LINE__, {"small", big, "small2"});
 }
 
-TEST(CApi, SessionOptions) {
+TEST(CAPI, SessionOptions) {
   TF_SessionOptions* opt = TF_NewSessionOptions();
   TF_DeleteSessionOptions(opt);
 }
 
-TEST(CApi, SessionWithRunMetadata) {
+TEST(CAPI, SessionWithRunMetadata) {
   TF_Status* s = TF_NewStatus();
   TF_SessionOptions* opt = TF_NewSessionOptions();
   TF_Session* session = TF_NewSession(opt, s);
@@ -230,7 +230,7 @@ TEST(CAPI, StatusEnum) {
   EXPECT_EQ(TF_DATA_LOSS, static_cast<TF_Code>(tensorflow::error::DATA_LOSS));
 }
 
-TEST(CApi, GetAllOpList) {
+TEST(CAPI, GetAllOpList) {
   TF_Buffer* buf = TF_GetAllOpList();
   tensorflow::OpList op_list;
   EXPECT_TRUE(op_list.ParseFromArray(buf->data, buf->length));
@@ -646,6 +646,47 @@ TEST(CAPI, Graph) {
   TF_DeleteStatus(s);
 }
 
+TEST(CAPI, ImportGraphDef) {
+  TF_Status* s = TF_NewStatus();
+  TF_Graph* graph = TF_NewGraph();
+
+  // Create a graph with two nodes: x and 3
+  Placeholder(graph, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  ASSERT_TRUE(TF_GraphOperationByName(graph, "feed") != nullptr);
+  ScalarConst(3, graph, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  ASSERT_TRUE(TF_GraphOperationByName(graph, "scalar") != nullptr);
+
+  // Export to a GraphDef
+  TF_Buffer* graph_def = TF_NewBuffer();
+  TF_GraphToGraphDef(graph, graph_def, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+
+  // Import it again, with a prefix, in a fresh graph.
+  TF_DeleteGraph(graph);
+  graph = TF_NewGraph();
+  TF_ImportGraphDefOptions* opts = TF_NewImportGraphDefOptions();
+  TF_ImportGraphDefOptionsSetPrefix(opts, "imported");
+  TF_GraphImportGraphDef(graph, graph_def, opts, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+
+  TF_DeleteImportGraphDefOptions(opts);
+  TF_DeleteBuffer(graph_def);
+
+  TF_Operation* scalar = TF_GraphOperationByName(graph, "imported/scalar");
+  TF_Operation* feed = TF_GraphOperationByName(graph, "imported/feed");
+  ASSERT_TRUE(scalar != nullptr);
+  ASSERT_TRUE(feed != nullptr);
+
+  // Can add nodes to the imported graph without trouble.
+  Add(feed, scalar, graph, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+
+  TF_DeleteGraph(graph);
+  TF_DeleteStatus(s);
+}
+
 class CSessionWithGraph {
  public:
   CSessionWithGraph(TF_Graph* graph, TF_Status* s) {
diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index 201dc431270..26362ffdc6e 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -48,6 +48,42 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "gradient_checker",
+    srcs = ["framework/gradient_checker.cc"],
+    hdrs = ["framework/gradient_checker.h"],
+    deps = [
+        ":cc_ops",
+        ":client_session",
+        ":grad_op_registry",
+        ":gradients",
+        ":ops",
+        ":scope",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_cc_test(
+    name = "framework_gradient_checker_test",
+    srcs = ["framework/gradient_checker_test.cc"],
+    deps = [
+        ":cc_ops",
+        ":grad_op_registry",
+        ":grad_ops",
+        ":gradient_checker",
+        ":testutil",
+        "//tensorflow/core:all_kernels",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 cc_library(
     name = "grad_ops",
     deps = [
diff --git a/tensorflow/cc/framework/gradient_checker.cc b/tensorflow/cc/framework/gradient_checker.cc
new file mode 100644
index 00000000000..a85035896cc
--- /dev/null
+++ b/tensorflow/cc/framework/gradient_checker.cc
@@ -0,0 +1,165 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/cc/framework/gradient_checker.h"
+
+#include "tensorflow/cc/client/client_session.h"
+#include "tensorflow/cc/framework/gradients.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+
+namespace tensorflow {
+using namespace ops;  // NOLINT(build/namespaces)
+
+namespace {
+
+// TODO(andydavis) Support returning relative error (as opposed to max error)
+// between theoretical and numerical jacobians:
+//   fabs(jac_t - jac_n) / max(fabs(jac_t), fabs(jac_n))
+
+// TODO(andydavis) Vectorize and/or multi-thread Jacobian computations if
+// performance becomes an issue.
+
+template <typename T>
+Status ComputeTheoreticalJacobianTranspose(
+    const Scope& scope, const ops::Output& x, const TensorShape& x_shape,
+    const Tensor& x_data, const ops::Output& y, const TensorShape& y_shape,
+    Tensor* jacobian_t) {
+  // Call AddSymbolicGradients to get 'dx' (we will feed 'dy').
+  auto dy = Cast(scope, Const(scope, 1.0, y_shape), x.type());
+  std::vector<ops::Output> outputs;
+  TF_RETURN_IF_ERROR(AddSymbolicGradients(scope, {y}, {x}, {dy}, &outputs));
+  auto dx = outputs[0];
+
+  // Initialize 'dy_data' to zeros.
+  Tensor dy_data(y.type(), y_shape);
+  auto dy_data_flat = dy_data.flat<T>();
+  dy_data_flat.setZero();
+
+  // Compute the theoretical Jacobian one row at a time by backpropagating
+  // '1.0' for each element of 'dy', holding all other elements of 'dy' at zero.
+  ClientSession session(scope);
+  std::vector<Tensor> dxout;
+  const int64 x_size = x_shape.num_elements();
+  const int64 dy_size = y_shape.num_elements();
+  auto jacobian = jacobian_t->matrix<T>();
+  for (int c = 0; c < dy_size; ++c) {
+    dy_data_flat(c) = 1.0;
+
+    TF_RETURN_IF_ERROR(session.Run({{x, x_data}, {dy, dy_data}}, {dx}, &dxout));
+
+    auto dx_flat = dxout[0].flat<T>();
+    for (int r = 0; r < x_size; ++r) {
+      jacobian(r, c) = dx_flat(r);
+    }
+
+    dy_data_flat(c) = 0.0;
+  }
+  return Status::OK();
+}
+
+template <typename T>
+Status ComputeNumericJacobianTranspose(const Scope& scope, const ops::Output& x,
+                                       const TensorShape& x_shape,
+                                       const ops::Output& y,
+                                       const TensorShape& y_shape,
+                                       const T delta, Tensor* x_data,
+                                       Tensor* jacobian_t) {
+  const int64 x_size = x_shape.num_elements();
+  const int64 y_size = y_shape.num_elements();
+  auto x_data_flat = x_data->flat<T>();
+
+  // Compute the numeric Jacobian one column at a time by perturbing each
+  // element of 'x_data' (positively and negatively) by 'delta', and
+  // updating the jacobian with the centered difference.
+  ClientSession session(scope);
+  std::vector<Tensor> yout;
+  auto jacobian = jacobian_t->matrix<T>();
+  for (int r = 0; r < x_size; ++r) {
+    // Store current value of 'x' at 'r'.
+    T v = x_data_flat(r);
+    // Evaluate at positive delta.
+    x_data_flat(r) = v + delta;
+    TF_RETURN_IF_ERROR(session.Run({{x, *x_data}}, {y}, &yout));
+    Tensor y_pos = yout[0];
+    // Evaluate at negative delta.
+    x_data_flat(r) = v - delta;
+    TF_RETURN_IF_ERROR(session.Run({{x, *x_data}}, {y}, &yout));
+    Tensor y_neg = yout[0];
+    // Compute element-wise centered difference and store in Jacobian.
+    auto y_pos_flat = y_pos.flat<T>();
+    auto y_neg_flat = y_neg.flat<T>();
+    const T scale = 2 * delta;
+    for (int c = 0; c < y_size; ++c) {
+      jacobian(r, c) = (y_pos_flat(c) - y_neg_flat(c)) / scale;
+    }
+    // Restore pre-perturbation value.
+    x_data_flat(r) = v;
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+template <typename T>
+Status ComputeGradientError(const Scope& scope, const ops::Output& x,
+                            const TensorShape& x_shape, const ops::Output& y,
+                            const TensorShape& y_shape, T* max_error) {
+  const int64 x_size = x_shape.num_elements();
+  const int64 y_size = y_shape.num_elements();
+
+  // Initialize 'x_data' to random values.
+  Tensor x_data(x.type(), x_shape);
+  auto x_data_flat = x_data.flat<T>();
+  x_data_flat.setRandom();
+
+  // Initialize theoretical Jacobian to zeros.
+  Tensor jacobian_t(x.type(), {x_size, y_size});
+  auto jacobian_t_flat = jacobian_t.flat<T>();
+  jacobian_t_flat.setZero();
+
+  // Compute theoretical Jacobian.
+  TF_RETURN_IF_ERROR(ComputeTheoreticalJacobianTranspose<T>(
+      scope, x, x_shape, x_data, y, y_shape, &jacobian_t));
+
+  // Initialize numeric Jacobian to zeros.
+  Tensor jacobian_n(x.type(), {x_size, y_size});
+  auto jacobian_n_flat = jacobian_n.flat<T>();
+  jacobian_n_flat.setZero();
+
+  // Compute numeric Jacobian.
+  TF_RETURN_IF_ERROR(ComputeNumericJacobianTranspose<T>(
+      scope, x, x_shape, y, y_shape, 1e-3, &x_data, &jacobian_n));
+
+  // Compute the maximum error between theoretical and numeric Jacobians.
+  *max_error = 0.0;
+  auto jac_t = jacobian_t.matrix<T>();
+  auto jac_n = jacobian_n.matrix<T>();
+  for (int r = 0; r < x_size; ++r) {
+    for (int c = 0; c < y_size; ++c) {
+      *max_error = std::max(*max_error, std::fabs(jac_t(r, c) - jac_n(r, c)));
+    }
+  }
+  return Status::OK();
+}
+
+#define INSTANTIATE_GRAD_ERR_TYPE(T)                                        \
+  template Status ComputeGradientError<T>(                                  \
+      const Scope& scope, const ops::Output& x, const TensorShape& x_shape, \
+      const ops::Output& y, const TensorShape& y_shape, T* max_error)
+
+INSTANTIATE_GRAD_ERR_TYPE(float);
+INSTANTIATE_GRAD_ERR_TYPE(double);
+
+}  // namespace tensorflow
diff --git a/tensorflow/cc/framework/gradient_checker.h b/tensorflow/cc/framework/gradient_checker.h
new file mode 100644
index 00000000000..57e2154b68a
--- /dev/null
+++ b/tensorflow/cc/framework/gradient_checker.h
@@ -0,0 +1,35 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_GRADIENT_CHECKER_H_
+#define THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_GRADIENT_CHECKER_H_
+
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+
+// Returns in 'max_error' the maximum element-wise error for dy/dx between the
+// computed and numeric Jacobian matrices where 'x' and 'y' are tensors.
+// This function adds operations to the graph associated with 'scope'.
+template <typename T>
+Status ComputeGradientError(const Scope& scope, const ops::Output& x,
+                            const TensorShape& x_shape, const ops::Output& y,
+                            const TensorShape& y_shape, T* max_error);
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_GRADIENT_CHECKER_H_
diff --git a/tensorflow/cc/framework/gradient_checker_test.cc b/tensorflow/cc/framework/gradient_checker_test.cc
new file mode 100644
index 00000000000..27ec4fb4bdb
--- /dev/null
+++ b/tensorflow/cc/framework/gradient_checker_test.cc
@@ -0,0 +1,70 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/cc/framework/gradient_checker.h"
+#include "tensorflow/cc/framework/grad_op_registry.h"
+#include "tensorflow/cc/framework/testutil.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/graph/equal_graph_def.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+using namespace ops;  // NOLINT(build/namespaces)
+
+namespace {
+
+TEST(GradientCheckerTest, BasicFloat) {
+  Scope scope = Scope::NewRootScope();
+  TensorShape shape({2, 4, 3});
+  auto x = Placeholder(scope, DT_FLOAT, Placeholder::Shape(shape));
+  auto y = Square(scope, x);
+  float max_error;
+  TF_ASSERT_OK(
+      ComputeGradientError<float>(scope, x, shape, y, shape, &max_error));
+  EXPECT_LT(max_error, 1e-4);
+}
+
+TEST(GradientCheckerTest, BasicDouble) {
+  Scope scope = Scope::NewRootScope();
+  TensorShape shape({2, 4, 3});
+  auto x = Placeholder(scope, DT_DOUBLE, Placeholder::Shape(shape));
+  auto y = Square(scope, x);
+  double max_error;
+  TF_ASSERT_OK(
+      ComputeGradientError<double>(scope, x, shape, y, shape, &max_error));
+  EXPECT_LT(max_error, 1e-10);
+}
+
+TEST(GradientCheckerTest, MatMulGrad) {
+  Scope scope = Scope::NewRootScope();
+
+  TensorShape x_shape({4, 3});
+  TensorShape y_shape({3, 2});
+  TensorShape z_shape({4, 2});
+
+  auto x = Placeholder(scope, DT_DOUBLE, Placeholder::Shape(x_shape));
+  auto y = Const(scope, {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, y_shape);
+  auto z = MatMul(scope, x, y);
+  double max_error;
+  TF_ASSERT_OK(
+      ComputeGradientError<double>(scope, x, x_shape, z, z_shape, &max_error));
+  EXPECT_LT(max_error, 1e-10);
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD
new file mode 100644
index 00000000000..8d190a40af8
--- /dev/null
+++ b/tensorflow/cc/saved_model/BUILD
@@ -0,0 +1,68 @@
+# Description:
+# TensorFlow SavedModel.
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+cc_library(
+    name = "constants",
+    hdrs = ["constants.h"],
+)
+
+cc_library(
+    name = "loader",
+    srcs = ["loader.cc"],
+    hdrs = ["loader.h"],
+    deps = [
+        ":constants",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensorflow",
+    ],
+)
+
+tf_cc_test(
+    name = "loader_test",
+    srcs = ["loader_test.cc"],
+    data = [
+        ":saved_model_half_plus_two",
+    ],
+    linkstatic = 1,
+    deps = [
+        ":constants",
+        ":loader",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
+filegroup(
+    name = "saved_model_half_plus_two",
+    srcs = glob(["testdata/half_plus_two/*"]),
+)
+
+# -----------------------------------------------------------------------------
+# Google-internal targets.
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/cc/saved_model/constants.h b/tensorflow/cc/saved_model/constants.h
new file mode 100644
index 00000000000..9ac77aee7c8
--- /dev/null
+++ b/tensorflow/cc/saved_model/constants.h
@@ -0,0 +1,36 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_CONSTANTS_H_
+#define THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_CONSTANTS_H_
+
+namespace tensorflow {
+
+// SavedModel proto filename.
+constexpr char kSavedModelFilenamePb[] = "saved_model.pb";
+
+// SavedModel text format proto filename.
+constexpr char kSavedModelFilenamePbTxt[] = "saved_model.pbtxt";
+
+// SavedModel variables filename.
+constexpr char kSavedModelVariablesFilename[] = "saved_model_variables";
+
+// Commonly used tags.
+constexpr char kSavedModelTagServe[] = "serve";
+constexpr char kSavedModelTagTrain[] = "train";
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_CONSTANTS_H_
diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc
new file mode 100644
index 00000000000..f44f45b0bb9
--- /dev/null
+++ b/tensorflow/cc/saved_model/loader.cc
@@ -0,0 +1,126 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/cc/saved_model/loader.h"
+
+#include <unordered_set>
+
+#include "tensorflow/cc/saved_model/constants.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/protobuf/saved_model.pb.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace {
+
+Status ReadSavedModel(const string& export_dir, SavedModel* saved_model_proto) {
+  const string saved_model_path =
+      io::JoinPath(export_dir, kSavedModelFilenamePb);
+  return ReadBinaryProto(Env::Default(), saved_model_path, saved_model_proto);
+}
+
+Status FindMetaGraphDefToLoad(const SavedModel& saved_model_proto,
+                              const std::unordered_set<string>& tags,
+                              MetaGraphDef* meta_graph_def_to_load) {
+  for (const MetaGraphDef& meta_graph_def : saved_model_proto.meta_graphs()) {
+    // Get tags from the meta_graph_def.
+    std::unordered_set<string> graph_tags;
+    for (const string& tag : meta_graph_def.meta_info_def().tags()) {
+      graph_tags.insert(tag);
+    }
+    // Match with the set of tags provided.
+    if (graph_tags == tags) {
+      *meta_graph_def_to_load = meta_graph_def;
+      return Status::OK();
+    }
+  }
+  return Status(error::Code::NOT_FOUND,
+                "Could not find meta graph def matching supplied tags.");
+}
+
+Status LoadMetaGraphIntoSession(const MetaGraphDef& meta_graph_def,
+                                const SessionOptions& session_options,
+                                std::unique_ptr<Session>* session) {
+  session->reset(NewSession(session_options));
+  return (*session)->Create(meta_graph_def.graph_def());
+}
+
+Status Restore(const RunOptions& run_options, const string& export_dir,
+               const StringPiece restore_op_name,
+               const StringPiece variable_filename_const_op_name,
+               Session* session) {
+  const string variables_path =
+      io::JoinPath(export_dir, kSavedModelVariablesFilename);
+  if (!Env::Default()->FileExists(variables_path)) {
+    return Status(error::Code::NOT_FOUND,
+                  "Could not find checkpointed variables.");
+  }
+
+  // Add variables to the graph.
+  Tensor variables_path_tensor(DT_STRING, TensorShape({}));
+  variables_path_tensor.scalar<string>()() = variables_path;
+
+  std::vector<std::pair<string, Tensor>> inputs = {
+      {variable_filename_const_op_name.ToString(), variables_path_tensor}};
+
+  RunMetadata run_metadata;
+  return session->Run(run_options, inputs, {}, {restore_op_name.ToString()},
+                      nullptr /* outputs */, &run_metadata);
+}
+
+}  // namespace
+
+Status LoadSavedModel(const string& export_dir,
+                      const std::unordered_set<string>& tags,
+                      const SessionOptions& session_options,
+                      const RunOptions& run_options,
+                      SavedModelBundle* const bundle) {
+  if (!MaybeSavedModelDirectory(export_dir)) {
+    return Status(error::Code::NOT_FOUND,
+                  "SavedModel not found in export directory: " + export_dir);
+  }
+  LOG(INFO) << "Loading SavedModel from: " << export_dir;
+
+  SavedModel saved_model_proto;
+  TF_RETURN_IF_ERROR(ReadSavedModel(export_dir, &saved_model_proto));
+
+  TF_RETURN_IF_ERROR(
+      FindMetaGraphDefToLoad(saved_model_proto, tags, &bundle->meta_graph_def));
+
+  TF_RETURN_IF_ERROR(LoadMetaGraphIntoSession(
+      bundle->meta_graph_def, session_options, &bundle->session));
+
+  TF_RETURN_IF_ERROR(
+      Restore(run_options, export_dir,
+              bundle->meta_graph_def.saver_def().restore_op_name(),
+              bundle->meta_graph_def.saver_def().filename_tensor_name(),
+              bundle->session.get()));
+
+  LOG(INFO) << "Done loading SavedModel.";
+  return Status::OK();
+}
+
+bool MaybeSavedModelDirectory(const string& export_dir) {
+  const string saved_model_pb_path =
+      io::JoinPath(export_dir, kSavedModelFilenamePb);
+  const string saved_model_pbtxt_path =
+      io::JoinPath(export_dir, kSavedModelFilenamePbTxt);
+  return Env::Default()->FileExists(saved_model_pb_path) ||
+         Env::Default()->FileExists(saved_model_pbtxt_path);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/cc/saved_model/loader.h b/tensorflow/cc/saved_model/loader.h
new file mode 100644
index 00000000000..43f5077bd83
--- /dev/null
+++ b/tensorflow/cc/saved_model/loader.h
@@ -0,0 +1,55 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// SavedModel loading functions and SavedModelBundle struct.
+
+#ifndef THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_LOADER_H_
+#define THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_LOADER_H_
+
+#include <string>
+#include <unordered_set>
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/meta_graph.pb.h"
+#include "tensorflow/core/public/session.h"
+
+namespace tensorflow {
+
+// SavedModel representation once the SavedModel is loaded from storage.
+struct SavedModelBundle {
+  std::unique_ptr<Session> session;
+  MetaGraphDef meta_graph_def;
+};
+
+// Loads a SavedModel from the specified export directory. The meta graph def to
+// be loaded is identified by the supplied tags, corresponding exactly to the
+// set of tags used at SavedModel build time. Returns a SavedModel bundle with a
+// session and the requested meta graph def, if found.
+Status LoadSavedModel(const string& export_dir,
+                      const std::unordered_set<string>& tags,
+                      const SessionOptions& session_options,
+                      const RunOptions& run_options,
+                      SavedModelBundle* const bundle);
+
+// Checks whether the provided directory could contain a SavedModel. Note that
+// the method does not load any data by itself. If the method returns `false`,
+// the export directory definitely does not contain a SavedModel. If the method
+// returns `true`, the export directory may contain a SavedModel but provides no
+// guarantee that it can be loaded.
+bool MaybeSavedModelDirectory(const string& export_dir);
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_LOADER_H_
diff --git a/tensorflow/cc/saved_model/loader_test.cc b/tensorflow/cc/saved_model/loader_test.cc
new file mode 100644
index 00000000000..84ebe03f26d
--- /dev/null
+++ b/tensorflow/cc/saved_model/loader_test.cc
@@ -0,0 +1,129 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/cc/saved_model/loader.h"
+
+#include "tensorflow/cc/saved_model/constants.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+constexpr char kTestData[] = "cc/saved_model/testdata/half_plus_two";
+
+class LoaderTest : public ::testing::Test {
+ protected:
+  LoaderTest() {}
+
+  void CheckSavedModelBundle(const SavedModelBundle& bundle) {
+    // Validate the half plus two behavior.
+    Tensor input = test::AsTensor<float>({0, 1, 2, 3}, TensorShape({4, 1}));
+
+    // Retrieve the regression signature from meta graph def.
+    const auto signature_def_map = bundle.meta_graph_def.signature_def();
+    const auto signature_def = signature_def_map.at("regression");
+
+    const string input_name = signature_def.inputs().at("input").name();
+    const string output_name = signature_def.outputs().at("output").name();
+
+    std::vector<Tensor> outputs;
+    TF_ASSERT_OK(bundle.session->Run({{input_name, input}}, {output_name}, {},
+                                     &outputs));
+    ASSERT_EQ(outputs.size(), 1);
+    test::ExpectTensorEqual<float>(
+        outputs[0],
+        test::AsTensor<float>({2, 2.5, 3, 3.5}, TensorShape({4, 1})));
+  }
+};
+
+TEST_F(LoaderTest, TagMatch) {
+  SavedModelBundle bundle;
+  SessionOptions session_options;
+  RunOptions run_options;
+
+  const string export_dir =
+      io::JoinPath(testing::TensorFlowSrcRoot(), kTestData);
+  TF_ASSERT_OK(LoadSavedModel(export_dir, {kSavedModelTagServe},
+                              session_options, run_options, &bundle));
+  CheckSavedModelBundle(bundle);
+}
+
+TEST_F(LoaderTest, NoTagMatch) {
+  SavedModelBundle bundle;
+  RunOptions run_options;
+  SessionOptions session_options;
+
+  const string export_dir =
+      io::JoinPath(testing::TensorFlowSrcRoot(), kTestData);
+  Status st = LoadSavedModel(export_dir, {"missing-tag"}, session_options,
+                             run_options, &bundle);
+  EXPECT_FALSE(st.ok());
+  EXPECT_TRUE(
+      StringPiece(st.error_message())
+          .contains("Could not find meta graph def matching supplied tags."))
+      << st.error_message();
+}
+
+TEST_F(LoaderTest, NoTagMatchMultiple) {
+  SavedModelBundle bundle;
+  RunOptions run_options;
+  SessionOptions session_options;
+
+  const string export_dir =
+      io::JoinPath(testing::TensorFlowSrcRoot(), kTestData);
+  Status st = LoadSavedModel(export_dir, {kSavedModelTagServe, "missing-tag"},
+                             session_options, run_options, &bundle);
+  EXPECT_FALSE(st.ok());
+  EXPECT_TRUE(
+      StringPiece(st.error_message())
+          .contains("Could not find meta graph def matching supplied tags."))
+      << st.error_message();
+}
+
+TEST_F(LoaderTest, InvalidExportPath) {
+  SavedModelBundle bundle;
+  RunOptions run_options;
+  SessionOptions session_options;
+
+  const string export_dir =
+      io::JoinPath(testing::TensorFlowSrcRoot(), "missing-path");
+  Status st = LoadSavedModel(export_dir, {kSavedModelTagServe}, session_options,
+                             run_options, &bundle);
+  EXPECT_FALSE(st.ok());
+}
+
+TEST_F(LoaderTest, MaybeSavedModelDirectory) {
+  // Valid SavedModel directory.
+  const string export_dir =
+      io::JoinPath(testing::TensorFlowSrcRoot(), kTestData);
+  EXPECT_TRUE(MaybeSavedModelDirectory(export_dir));
+
+  // Directory that does not exist.
+  const string missing_export_dir =
+      io::JoinPath(testing::TensorFlowSrcRoot(), "missing-path");
+  EXPECT_FALSE(MaybeSavedModelDirectory(missing_export_dir));
+
+  // Directory that exists but is an invalid SavedModel location.
+  const string invalid_export_dir =
+      io::JoinPath(testing::TensorFlowSrcRoot(), "cc/saved_model/testdata");
+  EXPECT_FALSE(MaybeSavedModelDirectory(invalid_export_dir));
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/cc/saved_model/testdata/half_plus_two/checkpoint b/tensorflow/cc/saved_model/testdata/half_plus_two/checkpoint
new file mode 100644
index 00000000000..1325869eb7c
--- /dev/null
+++ b/tensorflow/cc/saved_model/testdata/half_plus_two/checkpoint
@@ -0,0 +1,2 @@
+model_checkpoint_path: "/tmp/saved_model/half_plus_two/saved_model_variables"
+all_model_checkpoint_paths: "/tmp/saved_model/half_plus_two/saved_model_variables"
diff --git a/tensorflow/cc/saved_model/testdata/half_plus_two/saved_model.pb b/tensorflow/cc/saved_model/testdata/half_plus_two/saved_model.pb
new file mode 100644
index 00000000000..785374c4216
Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/half_plus_two/saved_model.pb differ
diff --git a/tensorflow/cc/saved_model/testdata/half_plus_two/saved_model_variables b/tensorflow/cc/saved_model/testdata/half_plus_two/saved_model_variables
new file mode 100644
index 00000000000..e1ac9e900e8
Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/half_plus_two/saved_model_variables differ
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index b3919acb302..05ed8465888 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -15,6 +15,7 @@ py_library(
     deps = [
         "//tensorflow/contrib/bayesflow:bayesflow_py",
         "//tensorflow/contrib/copy_graph:copy_graph_py",
+        "//tensorflow/contrib/crf:crf_py",
         "//tensorflow/contrib/cudnn_rnn:cudnn_rnn_py",
         "//tensorflow/contrib/distributions:distributions_py",
         "//tensorflow/contrib/factorization:factorization_py",
@@ -35,6 +36,7 @@ py_library(
         "//tensorflow/contrib/slim:nets",
         "//tensorflow/contrib/tensor_forest:tensor_forest_py",
         "//tensorflow/contrib/tensor_forest/hybrid:ops_lib",
+        "//tensorflow/contrib/tensorboard",
         "//tensorflow/contrib/testing:testing_py",
         "//tensorflow/contrib/training:training_py",
         "//tensorflow/contrib/util:util_py",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index a02c444b075..647c466d930 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 # Add projects here, they will show up under tf.contrib.
 from tensorflow.contrib import bayesflow
 from tensorflow.contrib import copy_graph
+from tensorflow.contrib import crf
 from tensorflow.contrib import cudnn_rnn
 from tensorflow.contrib import distributions
 from tensorflow.contrib import factorization
@@ -38,6 +39,7 @@ from tensorflow.contrib import quantization
 from tensorflow.contrib import rnn
 from tensorflow.contrib import slim
 from tensorflow.contrib import tensor_forest
+from tensorflow.contrib import tensorboard
 from tensorflow.contrib import testing
 from tensorflow.contrib import training
 from tensorflow.contrib import util
diff --git a/tensorflow/contrib/bayesflow/python/ops/monte_carlo.py b/tensorflow/contrib/bayesflow/python/ops/monte_carlo.py
index ce58f82efa2..b9815b8cf19 100644
--- a/tensorflow/contrib/bayesflow/python/ops/monte_carlo.py
+++ b/tensorflow/contrib/bayesflow/python/ops/monte_carlo.py
@@ -55,7 +55,7 @@ Log E_q[ f(Z) p(Z) / q(Z) ]
 C := Max[ Log[f(Z)] + Log[p(Z)] - Log[q(Z)] ].
 ```
 
-The maximum value of the exponentiated term will be 0.0, and the the expecation
+The maximum value of the exponentiated term will be 0.0, and the expectation
 can be evaluated in a stable manner.
 
 ## Ops
@@ -252,9 +252,7 @@ def expectation(f, p, z=None, n=None, seed=None, name='expectation'):
   User supplies either `Tensor` of samples `z`, or number of samples to draw `n`
 
   Args:
-    f: Callable mapping samples from `sampling_dist_q` to `Tensors` with
-      shape broadcastable to `q.batch_shape`.
-      For example, `f` works "just like" `sampling_dist_q.log_prob`.
+    f: Callable mapping samples from `p` to `Tensors`.
     p:  `tf.contrib.distributions.BaseDistribution`.
     z:  `Tensor` of samples from `p`, produced by `p.sample_n`.
     n:  Integer `Tensor`.  Number of samples to generate if `z` is not provided.
@@ -262,7 +260,36 @@ def expectation(f, p, z=None, n=None, seed=None, name='expectation'):
     name:  A name to give this `Op`.
 
   Returns:
-    A `Tensor` with same `dtype` as `p`, and shape equal to `p.batch_shape`.
+    A `Tensor` with the same `dtype` as `p`.
+
+  Example:
+
+  ```python
+  N_samples = 10000
+
+  distributions = tf.contrib.distributions
+
+  dist = distributions.Uniform([0.0, 0.0], [1.0, 2.0])
+  elementwise_mean = lambda x: x
+  mean_sum = lambda x: tf.reduce_sum(x, 1)
+
+  estimate_elementwise_mean_tf = monte_carlo.expectation(elementwise_mean,
+                                                         dist,
+                                                         n=N_samples)
+  estimate_mean_sum_tf = monte_carlo.expectation(mean_sum,
+                                                 dist,
+                                                 n=N_samples)
+
+  with tf.Session() as sess:
+    estimate_elementwise_mean, estimate_mean_sum = (
+        sess.run([estimate_elementwise_mean_tf, estimate_mean_sum_tf]))
+  print estimate_elementwise_mean
+  >>> np.array([ 0.50018013  1.00097895], dtype=np.float32)
+  print estimate_mean_sum
+  >>> 1.49571
+
+  ```
+
   """
   with ops.name_scope(name, values=[n, z]):
     z = _get_samples(p, z, n, seed)
diff --git a/tensorflow/contrib/cmake/setup.py b/tensorflow/contrib/cmake/setup.py
index b8a30adc9d9..574fa57b7ed 100644
--- a/tensorflow/contrib/cmake/setup.py
+++ b/tensorflow/contrib/cmake/setup.py
@@ -19,27 +19,19 @@ from __future__ import print_function
 
 import fnmatch
 import os
-import platform
 import re
 import sys
 
-from setuptools import find_packages, setup, Command, Extension
+from setuptools import find_packages, setup, Command
 from setuptools.command.install import install as InstallCommandBase
 from setuptools.dist import Distribution
 
 _VERSION = '0.10.0-cmake-experimental'
 
-numpy_version = "1.8.2"
-if platform.system() == "Darwin":
-  # There are bugs with numpy pip installation on OS X prior to
-  # 1.10.1, so on mac we require a higher version than on other
-  # platforms.
-  numpy_version = "1.10.1"
-
 REQUIRED_PACKAGES = [
-    'numpy >= %s' % numpy_version,
+    'numpy >= 1.11.0',
     'six >= 1.10.0',
-    'protobuf == 3.0.0b2',
+    'protobuf == 3.0.0',
 ]
 
 # python3 requires wheel 0.26
diff --git a/tensorflow/contrib/crf/BUILD b/tensorflow/contrib/crf/BUILD
new file mode 100644
index 00000000000..33c1323b481
--- /dev/null
+++ b/tensorflow/contrib/crf/BUILD
@@ -0,0 +1,40 @@
+# Description:
+#   Contains classes to construct a CRF layer
+#   APIs here are meant to evolve over time.
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+package(default_visibility = ["//tensorflow:__subpackages__"])
+
+load("//tensorflow:tensorflow.bzl", "cuda_py_tests")
+
+py_library(
+    name = "crf_py",
+    srcs = ["__init__.py"] + glob(["python/ops/*.py"]),
+    srcs_version = "PY2AND3",
+)
+
+cuda_py_tests(
+    name = "crf_test",
+    srcs = ["python/kernel_tests/crf_test.py"],
+    additional_deps = [
+        ":crf_py",
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/crf/README.md b/tensorflow/contrib/crf/README.md
new file mode 100644
index 00000000000..68d1101ecd7
--- /dev/null
+++ b/tensorflow/contrib/crf/README.md
@@ -0,0 +1,76 @@
+# CRF
+
+The CRF module implements a linear-chain CRF layer for learning to predict tag sequences. This variant of the CRF is factored into unary potentials for every element in the sequence and binary potentials for every transition between output tags.
+
+### Usage
+
+Below is an example of the API, which learns a CRF for some random data. The linear layer in the example can be replaced by any neural network.
+
+
+```python
+import numpy as np
+import tensorflow as tf
+
+# Data settings.
+num_examples = 10
+num_words = 20
+num_features = 100
+num_tags = 5
+
+# Random features.
+x = np.random.rand(num_examples, num_words, num_features).astype(np.float32)
+
+# Random tag indices representing the gold sequence.
+y = np.random.randint(num_tags, size=[num_examples, num_words]).astype(np.int32)
+
+# All sequences in this example have the same length, but they can be variable in a real model.
+sequence_lengths = np.full(num_examples, num_words - 1, dtype=np.int32)
+
+# Train and evaluate the model.
+with tf.Graph().as_default():
+  with tf.Session() as session:
+    # Add the data to the TensorFlow graph.
+    x_t = tf.constant(x)
+    y_t = tf.constant(y)
+    sequence_lengths_t = tf.constant(sequence_lengths)
+
+    # Compute unary scores from a linear layer.
+    weights = tf.get_variable("weights", [num_features, num_tags])
+    matricized_x_t = tf.reshape(x_t, [-1, num_features])
+    matricized_unary_scores = tf.batch_matmul(matricized_x_t, weights)
+    unary_scores = tf.reshape(matricized_unary_scores,
+                              [num_examples, num_words, num_tags])
+
+    # Compute the log-likelihood of the gold sequences and keep the transition
+    # params for inference at test time.
+    log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
+        unary_scores, y_t, sequence_lengths_t)
+
+    # Add a training op to tune the parameters.
+    loss = tf.reduce_mean(-log_likelihood)
+    train_op = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
+
+    # Train for a fixed number of iterations.
+    session.run(tf.initialize_all_variables())
+    for i in range(1000):
+      tf_unary_scores, tf_transition_params, _ = session.run(
+          [unary_scores, transition_params, train_op])
+      if i % 100 == 0:
+        correct_labels = 0
+        total_labels = 0
+        for tf_unary_scores_, y_, sequence_length_ in zip(tf_unary_scores, y,
+                                                          sequence_lengths):
+          # Remove padding from the scores and tag sequence.
+          tf_unary_scores_ = tf_unary_scores_[:sequence_length_]
+          y_ = y_[:sequence_length_]
+
+          # Compute the highest scoring sequence.
+          viterbi_sequence, _ = tf.contrib.crf.viterbi_decode(
+              tf_unary_scores_, tf_transition_params)
+
+          # Evaluate word-level accuracy.
+          correct_labels += np.sum(np.equal(viterbi_sequence, y_))
+          total_labels += sequence_length_
+        accuracy = 100.0 * correct_labels / float(total_labels)
+        print("Accuracy: %.2f%%" % accuracy)
+```
diff --git a/tensorflow/contrib/crf/__init__.py b/tensorflow/contrib/crf/__init__.py
new file mode 100644
index 00000000000..195e8cd7171
--- /dev/null
+++ b/tensorflow/contrib/crf/__init__.py
@@ -0,0 +1,39 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Linear-chain CRF layer.
+
+## This package provides functions for building a linear-chain CRF layer.
+
+@@crf_sequence_score
+@@crf_log_norm
+@@crf_log_likelihood
+@@crf_unary_score
+@@crf_binary_score
+@@CrfForwardRnnCell
+@@viterbi_decode
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.crf.python.ops.crf import _lengths_to_masks
+from tensorflow.contrib.crf.python.ops.crf import crf_binary_score
+from tensorflow.contrib.crf.python.ops.crf import crf_log_likelihood
+from tensorflow.contrib.crf.python.ops.crf import crf_log_norm
+from tensorflow.contrib.crf.python.ops.crf import crf_sequence_score
+from tensorflow.contrib.crf.python.ops.crf import crf_unary_score
+from tensorflow.contrib.crf.python.ops.crf import CrfForwardRnnCell
+from tensorflow.contrib.crf.python.ops.crf import viterbi_decode
diff --git a/tensorflow/contrib/crf/python/__init__.py b/tensorflow/contrib/crf/python/__init__.py
new file mode 100644
index 00000000000..8439848dd0e
--- /dev/null
+++ b/tensorflow/contrib/crf/python/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Linear-chain CRF."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
new file mode 100644
index 00000000000..539cabe6209
--- /dev/null
+++ b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
@@ -0,0 +1,200 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for CRF."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+import numpy as np
+import tensorflow as tf
+
+
+class CrfTest(tf.test.TestCase):
+
+  def testCrfSequenceScore(self):
+    inputs = np.array(
+        [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32)
+    tag_indices = np.array([1, 2, 1, 0], dtype=np.int32)
+    transition_params = np.array(
+        [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32)
+    sequence_lengths = np.array(3, dtype=np.int32)
+    with self.test_session() as sess:
+      sequence_score = tf.contrib.crf.crf_sequence_score(
+          inputs=tf.expand_dims(inputs, 0),
+          tag_indices=tf.expand_dims(tag_indices, 0),
+          sequence_lengths=tf.expand_dims(sequence_lengths, 0),
+          transition_params=tf.constant(transition_params))
+      sequence_score = tf.squeeze(sequence_score, [0])
+      tf_sequence_score = sess.run(sequence_score)
+      expected_unary_score = sum(inputs[i][tag_indices[i]]
+                                 for i in range(sequence_lengths))
+      expected_binary_score = sum(
+          transition_params[tag_indices[i], tag_indices[i + 1]]
+          for i in range(sequence_lengths - 1))
+      expected_sequence_score = expected_unary_score + expected_binary_score
+      self.assertAllClose(tf_sequence_score, expected_sequence_score)
+
+  def testCrfUnaryScore(self):
+    inputs = np.array(
+        [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32)
+    tag_indices = np.array([1, 2, 1, 0], dtype=np.int32)
+    sequence_lengths = np.array(3, dtype=np.int32)
+    with self.test_session() as sess:
+      unary_score = tf.contrib.crf.crf_unary_score(
+          tag_indices=tf.expand_dims(tag_indices, 0),
+          sequence_lengths=tf.expand_dims(sequence_lengths, 0),
+          inputs=tf.expand_dims(inputs, 0))
+      unary_score = tf.squeeze(unary_score, [0])
+      tf_unary_score = sess.run(unary_score)
+      expected_unary_score = sum(inputs[i][tag_indices[i]]
+                                 for i in range(sequence_lengths))
+      self.assertAllClose(tf_unary_score, expected_unary_score)
+
+  def testCrfBinaryScore(self):
+    tag_indices = np.array([1, 2, 1, 0], dtype=np.int32)
+    transition_params = np.array(
+        [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32)
+    sequence_lengths = np.array(3, dtype=np.int32)
+    with self.test_session() as sess:
+      binary_score = tf.contrib.crf.crf_binary_score(
+          tag_indices=tf.expand_dims(tag_indices, 0),
+          sequence_lengths=tf.expand_dims(sequence_lengths, 0),
+          transition_params=tf.constant(transition_params))
+      binary_score = tf.squeeze(binary_score, [0])
+      tf_binary_score = sess.run(binary_score)
+      expected_binary_score = sum(
+          transition_params[tag_indices[i], tag_indices[i + 1]]
+          for i in range(sequence_lengths - 1))
+      self.assertAllClose(tf_binary_score, expected_binary_score)
+
+  def testCrfLogNorm(self):
+    inputs = np.array(
+        [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32)
+    transition_params = np.array(
+        [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32)
+    num_words = inputs.shape[0]
+    num_tags = inputs.shape[1]
+    sequence_lengths = np.array(3, dtype=np.int32)
+    with self.test_session() as sess:
+      all_sequence_scores = []
+
+      # Compare the dynamic program with brute force computation.
+      for tag_indices in itertools.product(
+          range(num_tags), repeat=sequence_lengths):
+        tag_indices = list(tag_indices)
+        tag_indices.extend([0] * (num_words - sequence_lengths))
+        all_sequence_scores.append(
+            tf.contrib.crf.crf_sequence_score(
+                inputs=tf.expand_dims(inputs, 0),
+                tag_indices=tf.expand_dims(tag_indices, 0),
+                sequence_lengths=tf.expand_dims(sequence_lengths, 0),
+                transition_params=tf.constant(transition_params)))
+
+      brute_force_log_norm = tf.reduce_logsumexp(all_sequence_scores)
+      log_norm = tf.contrib.crf.crf_log_norm(
+          inputs=tf.expand_dims(inputs, 0),
+          sequence_lengths=tf.expand_dims(sequence_lengths, 0),
+          transition_params=tf.constant(transition_params))
+      log_norm = tf.squeeze(log_norm, [0])
+      tf_brute_force_log_norm, tf_log_norm = sess.run(
+          [brute_force_log_norm, log_norm])
+
+      self.assertAllClose(tf_log_norm, tf_brute_force_log_norm)
+
+  def testCrfLogLikelihood(self):
+    inputs = np.array(
+        [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32)
+    transition_params = np.array(
+        [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32)
+    sequence_lengths = np.array(3, dtype=np.int32)
+    num_words = inputs.shape[0]
+    num_tags = inputs.shape[1]
+    with self.test_session() as sess:
+      all_sequence_log_likelihoods = []
+
+      # Make sure all probabilities sum to 1.
+      for tag_indices in itertools.product(
+          range(num_tags), repeat=sequence_lengths):
+        tag_indices = list(tag_indices)
+        tag_indices.extend([0] * (num_words - sequence_lengths))
+        sequence_log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
+            inputs=tf.expand_dims(inputs, 0),
+            tag_indices=tf.expand_dims(tag_indices, 0),
+            sequence_lengths=tf.expand_dims(sequence_lengths, 0),
+            transition_params=tf.constant(transition_params))
+        all_sequence_log_likelihoods.append(sequence_log_likelihood)
+      total_log_likelihood = tf.reduce_logsumexp(all_sequence_log_likelihoods)
+      tf_total_log_likelihood = sess.run(total_log_likelihood)
+      self.assertAllClose(tf_total_log_likelihood, 0.0)
+
+  def testLengthsToMasks(self):
+    with self.test_session() as sess:
+      sequence_lengths = [4, 1, 8, 2]
+      max_sequence_length = max(sequence_lengths)
+
+      mask = tf.contrib.crf._lengths_to_masks(sequence_lengths,
+                                              max_sequence_length)
+      tf_mask = sess.run(mask)
+      self.assertEqual(len(tf_mask), len(sequence_lengths))
+      for m, l in zip(tf_mask, sequence_lengths):
+        self.assertAllEqual(m[:l], [1] * l)
+        self.assertAllEqual(m[l:], [0] * (len(m) - l))
+
+  def testViterbiDecode(self):
+    inputs = np.array(
+        [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32)
+    transition_params = np.array(
+        [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32)
+    sequence_lengths = np.array(3, dtype=np.int32)
+    num_words = inputs.shape[0]
+    num_tags = inputs.shape[1]
+
+    with self.test_session() as sess:
+      all_sequence_scores = []
+      all_sequences = []
+
+      # Compare the dynamic program with brute force computation.
+      for tag_indices in itertools.product(
+          range(num_tags), repeat=sequence_lengths):
+        tag_indices = list(tag_indices)
+        tag_indices.extend([0] * (num_words - sequence_lengths))
+        all_sequences.append(tag_indices)
+        sequence_score = tf.contrib.crf.crf_sequence_score(
+            inputs=tf.expand_dims(inputs, 0),
+            tag_indices=tf.expand_dims(tag_indices, 0),
+            sequence_lengths=tf.expand_dims(sequence_lengths, 0),
+            transition_params=tf.constant(transition_params))
+        sequence_score = tf.squeeze(sequence_score, [0])
+        all_sequence_scores.append(sequence_score)
+
+      tf_all_sequence_scores = sess.run(all_sequence_scores)
+
+      expected_max_sequence_index = np.argmax(tf_all_sequence_scores)
+      expected_max_sequence = all_sequences[expected_max_sequence_index]
+      expected_max_score = tf_all_sequence_scores[expected_max_sequence_index]
+
+      actual_max_sequence, actual_max_score = tf.contrib.crf.viterbi_decode(
+          inputs[:sequence_lengths], transition_params)
+
+      self.assertAllClose(actual_max_score, expected_max_score)
+      self.assertEqual(actual_max_sequence,
+                       expected_max_sequence[:sequence_lengths])
+
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/tensorflow/contrib/crf/python/ops/__init__.py b/tensorflow/contrib/crf/python/ops/__init__.py
new file mode 100644
index 00000000000..5ab8d7ac4a9
--- /dev/null
+++ b/tensorflow/contrib/crf/python/ops/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops for building a linear-chain CRF layer."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/crf/python/ops/crf.py b/tensorflow/contrib/crf/python/ops/crf.py
new file mode 100644
index 00000000000..fbbbc2d5c10
--- /dev/null
+++ b/tensorflow/contrib/crf/python/ops/crf.py
@@ -0,0 +1,311 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Module for constructing a linear-chain CRF.
+
+The following snippet is an example of a CRF layer on top of a batched sequence
+of unary scores (logits for every word). This example also decodes the most
+likely sequence at test time:
+
+log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
+    unary_scores, gold_tags, sequence_lengths)
+loss = tf.reduce_mean(-log_likelihood)
+train_op = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
+
+tf_unary_scores, tf_sequence_lengths, tf_transition_params, _ = session.run(
+    [unary_scores, sequence_lengths, transition_params, train_op])
+for tf_unary_scores_, tf_sequence_length_ in zip(tf_unary_scores,
+                                                 tf_sequence_lengths):
+# Remove padding.
+tf_unary_scores_ = tf_unary_scores_[:tf_sequence_length_]
+
+# Compute the highest score and its tag sequence.
+viterbi_sequence, viterbi_score = tf.contrib.crf.viterbi_decode(
+    tf_unary_scores_, tf_transition_params)
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import rnn
+from tensorflow.python.ops import rnn_cell
+from tensorflow.python.ops import variable_scope as vs
+
+__all__ = ["crf_sequence_score", "crf_log_norm", "crf_log_likelihood",
+           "crf_unary_score", "crf_binary_score", "CrfForwardRnnCell",
+           "viterbi_decode"]
+
+
+def _lengths_to_masks(lengths, max_length):
+  """Creates a binary matrix that can be used to mask away padding.
+
+  Args:
+    lengths: A vector of integers representing lengths.
+    max_length: An integer indicating the maximum length. All values in
+      lengths should be less than or equal to max_length.
+  Returns:
+    masks: Masks that can be used to get rid of padding.
+  """
+  tiled_ranges = array_ops.tile(
+      array_ops.expand_dims(math_ops.range(max_length), 0),
+      [array_ops.shape(lengths)[0], 1])
+  lengths = array_ops.expand_dims(lengths, 1)
+  masks = math_ops.to_float(
+      math_ops.to_int64(tiled_ranges) < math_ops.to_int64(lengths))
+  return masks
+
+
+def crf_sequence_score(inputs, tag_indices, sequence_lengths,
+                       transition_params):
+  """Computes the unnormalized score for a tag sequence.
+
+  Args:
+    inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials
+        to use as input to the CRF layer.
+    tag_indices: A [batch_size, max_seq_len] matrix of tag indices for which we
+        compute the unnormalized score.
+    sequence_lengths: A [batch_size] vector of true sequence lengths.
+    transition_params: A [num_tags, num_tags] transition matrix.
+  Returns:
+    sequence_scores: A [batch_size] vector of unnormalized sequence scores.
+  """
+  # Compute the scores of the given tag sequence.
+  unary_scores = crf_unary_score(tag_indices, sequence_lengths, inputs)
+  binary_scores = crf_binary_score(tag_indices, sequence_lengths,
+                                   transition_params)
+  sequence_scores = unary_scores + binary_scores
+  return sequence_scores
+
+
+def crf_log_norm(inputs, sequence_lengths, transition_params):
+  """Computes the normalization for a CRF.
+
+  Args:
+    inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials
+        to use as input to the CRF layer.
+    sequence_lengths: A [batch_size] vector of true sequence lengths.
+    transition_params: A [num_tags, num_tags] transition matrix.
+  Returns:
+    log_norm: A [batch_size] vector of normalizers for a CRF.
+  """
+  # Split up the first and rest of the inputs in preparation for the forward
+  # algorithm.
+  first_input = array_ops.slice(inputs, [0, 0, 0], [-1, 1, -1])
+  first_input = array_ops.squeeze(first_input, [1])
+  rest_of_input = array_ops.slice(inputs, [0, 1, 0], [-1, -1, -1])
+
+  # Compute the alpha values in the forward algorithm in order to get the
+  # partition function.
+  forward_cell = CrfForwardRnnCell(transition_params)
+  _, alphas = rnn.dynamic_rnn(
+      cell=forward_cell,
+      inputs=rest_of_input,
+      sequence_length=sequence_lengths - 1,
+      initial_state=first_input,
+      dtype=dtypes.float32)
+  log_norm = math_ops.reduce_logsumexp(alphas, [1])
+  return log_norm
+
+
+def crf_log_likelihood(inputs,
+                       tag_indices,
+                       sequence_lengths,
+                       transition_params=None):
+  """Computes the log-likelihood of tag sequences in a CRF.
+
+  Args:
+    inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials
+        to use as input to the CRF layer.
+    tag_indices: A [batch_size, max_seq_len] matrix of tag indices for which we
+        compute the log-likelihood.
+    sequence_lengths: A [batch_size] vector of true sequence lengths.
+    transition_params: A [num_tags, num_tags] transition matrix, if available.
+  Returns:
+    log_likelihood: A scalar containing the log-likelihood of the given sequence
+        of tag indices.
+    transition_params: A [num_tags, num_tags] transition matrix. This is either
+        provided by the caller or created in this function.
+  """
+  # Get shape information.
+  num_tags = inputs.get_shape()[2].value
+
+  # Get the transition matrix if not provided.
+  if transition_params is None:
+    transition_params = vs.get_variable("transitions", [num_tags, num_tags])
+
+  sequence_scores = crf_sequence_score(inputs, tag_indices, sequence_lengths,
+                                       transition_params)
+  log_norm = crf_log_norm(inputs, sequence_lengths, transition_params)
+
+  # Normalize the scores to get the log-likelihood.
+  log_likelihood = sequence_scores - log_norm
+  return log_likelihood, transition_params
+
+
+def crf_unary_score(tag_indices, sequence_lengths, inputs):
+  """Computes the unary scores of tag sequences.
+
+  Args:
+    tag_indices: A [batch_size, max_seq_len] matrix of tag indices.
+    sequence_lengths: A [batch_size] vector of true sequence lengths.
+    inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials.
+  Returns:
+    unary_scores: A [batch_size] vector of unary scores.
+  """
+  batch_size = array_ops.shape(inputs)[0]
+  max_seq_len = array_ops.shape(inputs)[1]
+  num_tags = array_ops.shape(inputs)[2]
+
+  flattened_inputs = array_ops.reshape(inputs, [-1])
+
+  offsets = array_ops.expand_dims(
+      math_ops.range(batch_size) * max_seq_len * num_tags, 1)
+  offsets += array_ops.expand_dims(math_ops.range(max_seq_len) * num_tags, 0)
+  flattened_tag_indices = array_ops.reshape(offsets + tag_indices, [-1])
+
+  unary_scores = array_ops.reshape(
+      array_ops.gather(flattened_inputs, flattened_tag_indices),
+      [batch_size, max_seq_len])
+
+  masks = _lengths_to_masks(sequence_lengths, array_ops.shape(tag_indices)[1])
+
+  unary_scores = math_ops.reduce_sum(unary_scores * masks, 1)
+  return unary_scores
+
+
+def crf_binary_score(tag_indices, sequence_lengths, transition_params):
+  """Computes the binary scores of tag sequences.
+
+  Args:
+    tag_indices: A [batch_size, max_seq_len] matrix of tag indices.
+    sequence_lengths: A [batch_size] vector of true sequence lengths.
+    transition_params: A [num_tags, num_tags] matrix of binary potentials.
+  Returns:
+    binary_scores: A [batch_size] vector of binary scores.
+  """
+  # Get shape information.
+  num_tags = transition_params.get_shape()[0]
+  num_transitions = array_ops.shape(tag_indices)[1] - 1
+
+  # Truncate by one on each side of the sequence to get the start and end
+  # indices of each transition.
+  start_tag_indices = array_ops.slice(tag_indices, [0, 0],
+                                      [-1, num_transitions])
+  end_tag_indices = array_ops.slice(tag_indices, [0, 1], [-1, num_transitions])
+
+  # Encode the indices in a flattened representation.
+  flattened_transition_indices = start_tag_indices * num_tags + end_tag_indices
+  flattened_transition_params = array_ops.reshape(transition_params, [-1])
+
+  # Get the binary scores based on the flattened representation.
+  binary_scores = array_ops.gather(flattened_transition_params,
+                                   flattened_transition_indices)
+
+  masks = _lengths_to_masks(sequence_lengths, array_ops.shape(tag_indices)[1])
+  truncated_masks = array_ops.slice(masks, [0, 1], [-1, -1])
+  binary_scores = math_ops.reduce_sum(binary_scores * truncated_masks, 1)
+  return binary_scores
+
+
+class CrfForwardRnnCell(rnn_cell.RNNCell):
+  """Computes the alpha values in a linear-chain CRF.
+
+  See http://www.cs.columbia.edu/~mcollins/fb.pdf for reference.
+  """
+
+  def __init__(self, transition_params):
+    """Initialize the CrfForwardRnnCell.
+
+    Args:
+      transition_params: A [num_tags, num_tags] matrix of binary potentials.
+          This matrix is expanded into a [1, num_tags, num_tags] in preparation
+          for the broadcast summation occurring within the cell.
+    """
+    self._transition_params = array_ops.expand_dims(transition_params, 0)
+    self._num_tags = transition_params.get_shape()[0].value
+
+  @property
+  def state_size(self):
+    return self._num_tags
+
+  @property
+  def output_size(self):
+    return self._num_tags
+
+  def __call__(self, inputs, state, scope=None):
+    """Build the CrfForwardRnnCell.
+
+    Args:
+      inputs: A [batch_size, num_tags] matrix of unary potentials.
+      state: A [batch_size, num_tags] matrix containing the previous alpha
+          values.
+      scope: Unused variable scope of this cell.
+
+    Returns:
+      new_alphas, new_alphas: A pair of [batch_size, num_tags] matrices
+          containing the new alpha values.
+    """
+    state = array_ops.expand_dims(state, 2)
+
+    # This addition op broadcasts self._transition_params along the zeroth
+    # dimension and state along the second dimension. This performs the
+    # multiplication of previous alpha values and the current binary potentials
+    # in log space.
+    transition_scores = state + self._transition_params
+    new_alphas = inputs + math_ops.reduce_logsumexp(transition_scores, [1])
+
+    # Both the state and the output of this RNN cell contain the alphas values.
+    # The output value is currently unused and simply satisfies the RNN API.
+    # This could be useful in the future if we need to compute marginal
+    # probabilities, which would require the accumulated alpha values at every
+    # time step.
+    return new_alphas, new_alphas
+
+
+def viterbi_decode(score, transition_params):
+  """Decode the highest scoring sequence of tags outside of TensorFlow.
+
+  This should only be used at test time.
+
+  Args:
+    score: A [seq_len, num_tags] matrix of unary potentials.
+    transition_params: A [num_tags, num_tags] matrix of binary potentials.
+
+  Returns:
+    viterbi: A [seq_len] list of integers containing the highest scoring tag
+        indices.
+    viterbi_score: A float containing the score for the viterbi sequence.
+  """
+  trellis = np.zeros_like(score)
+  backpointers = np.zeros_like(score, dtype=np.int32)
+  trellis[0] = score[0]
+
+  for t in range(1, score.shape[0]):
+    v = np.expand_dims(trellis[t - 1], 1) + transition_params
+    trellis[t] = score[t] + np.max(v, 0)
+    backpointers[t] = np.argmax(v, 0)
+
+  viterbi = [np.argmax(trellis[-1])]
+  for bp in reversed(backpointers[1:]):
+    viterbi.append(bp[viterbi[-1]])
+  viterbi.reverse()
+
+  viterbi_score = np.max(trellis[-1])
+  return viterbi, viterbi_score
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/kullback_leibler_test.py b/tensorflow/contrib/distributions/python/kernel_tests/kullback_leibler_test.py
index ea1395eb9d2..c68447e5c62 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/kullback_leibler_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/kullback_leibler_test.py
@@ -58,13 +58,6 @@ class KLTest(tf.test.TestCase):
       self.assertAllEqual([float("nan")], kl_ok.eval())
 
   def testRegistrationFailures(self):
-    with self.assertRaisesRegexp(TypeError, "is not a subclass of"):
-      tf.contrib.distributions.RegisterKL(
-          tf.contrib.distributions.Normal, object)(lambda x: x)
-    with self.assertRaisesRegexp(TypeError, "is not a subclass of"):
-      tf.contrib.distributions.RegisterKL(
-          object, tf.contrib.distributions.Normal)(lambda x: x)
-
     class MyDist(tf.contrib.distributions.Normal):
       pass
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py
index 0de7744f15a..d76e06da0d1 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import contextlib
-import functools
 
 import numpy as np
 import tensorflow as tf
@@ -69,9 +68,9 @@ def make_univariate_mixture(batch_shape, num_components):
   logits = tf.random_uniform(
       list(batch_shape) + [num_components], -1, 1, dtype=tf.float32) - 50.
   components = [
-      (distributions_py.Normal,
-       {"mu": np.float32(np.random.randn(*list(batch_shape))),
-        "sigma": np.float32(10 * np.random.rand(*list(batch_shape)))})
+      distributions_py.Normal(
+          mu=np.float32(np.random.randn(*list(batch_shape))),
+          sigma=np.float32(10 * np.random.rand(*list(batch_shape))))
       for _ in range(num_components)
   ]
   cat = distributions_py.Categorical(logits, dtype=tf.int32)
@@ -82,10 +81,10 @@ def make_multivariate_mixture(batch_shape, num_components, event_shape):
   logits = tf.random_uniform(
       list(batch_shape) + [num_components], -1, 1, dtype=tf.float32) - 50.
   components = [
-      (distributions_py.MultivariateNormalDiag,
-       {"mu": np.float32(np.random.randn(*list(batch_shape + event_shape))),
-        "diag_stdev": np.float32(10 * np.random.rand(
-            *list(batch_shape + event_shape)))})
+      distributions_py.MultivariateNormalDiag(
+          mu=np.float32(np.random.randn(*list(batch_shape + event_shape))),
+          diag_stdev=np.float32(10 * np.random.rand(
+              *list(batch_shape + event_shape))))
       for _ in range(num_components)
   ]
   cat = distributions_py.Categorical(logits, dtype=tf.int32)
@@ -116,7 +115,7 @@ class MixtureTest(tf.test.TestCase):
                                              r"cat.num_classes != len"):
       distributions_py.Mixture(
           distributions_py.Categorical([0.1, 0.5]),  # 2 classes
-          [(distributions_py.Normal, {"mu": 1.0, "sigma": 2.0})])
+          [distributions_py.Normal(mu=1.0, sigma=2.0)])
     with self.assertRaisesWithPredicateMatch(
         ValueError, r"\(\) and \(2,\) are not compatible"):
       # The value error is raised because the batch shapes of the
@@ -124,13 +123,13 @@ class MixtureTest(tf.test.TestCase):
       # vector of size (2,).
       distributions_py.Mixture(
           distributions_py.Categorical([-0.5, 0.5]),  # scalar batch
-          [(distributions_py.Normal, {"mu": 1.0, "sigma": 2.0}),  # scalar dist
-           (distributions_py.Normal, {"mu": [1.0, 1.0], "sigma": [2.0, 2.0]})])
+          [distributions_py.Normal(mu=1.0, sigma=2.0),  # scalar dist
+           distributions_py.Normal(mu=[1.0, 1.0], sigma=[2.0, 2.0])])
     with self.assertRaisesWithPredicateMatch(ValueError, r"Could not infer"):
       cat_logits = tf.placeholder(shape=[1, None], dtype=tf.int32)
       distributions_py.Mixture(
           distributions_py.Categorical(cat_logits),
-          [(distributions_py.Normal, {"mu": [1.0], "sigma": [2.0]})])
+          [distributions_py.Normal(mu=[1.0], sigma=[2.0])])
 
   def testBrokenShapesDynamic(self):
     with self.test_session():
@@ -138,8 +137,8 @@ class MixtureTest(tf.test.TestCase):
       d1_param = tf.placeholder(dtype=tf.float32)
       d = distributions_py.Mixture(
           distributions_py.Categorical([0.1, 0.2]),
-          [(distributions_py.Normal, {"mu": d0_param, "sigma": d0_param}),
-           (distributions_py.Normal, {"mu": d1_param, "sigma": d1_param})],
+          [distributions_py.Normal(mu=d0_param, sigma=d0_param),
+           distributions_py.Normal(mu=d1_param, sigma=d1_param)],
           validate_args=True)
       with self.assertRaisesOpError(r"batch shape must match"):
         d.sample().eval(feed_dict={d0_param: [2.0, 3.0], d1_param: [1.0]})
@@ -150,42 +149,24 @@ class MixtureTest(tf.test.TestCase):
     with self.assertRaisesWithPredicateMatch(TypeError, "Categorical"):
       distributions_py.Mixture(None, [])
     cat = distributions_py.Categorical([0.3, 0.2])
-    # components must be a list of tuples
-    with self.assertRaisesWithPredicateMatch(TypeError, "tuples of the form"):
+    # components must be a list of distributions
+    with self.assertRaisesWithPredicateMatch(
+        TypeError, "all .* must be Distribution instances"):
       distributions_py.Mixture(cat, [None])
-    # components tuples must be size 2
-    with self.assertRaisesWithPredicateMatch(TypeError, "tuples of the form"):
-      distributions_py.Mixture(cat, [tuple()])
-    # components tuples must be size 2
-    with self.assertRaisesWithPredicateMatch(TypeError, "tuples of the form"):
-      distributions_py.Mixture(cat, [(None)])
-    # components tuples must be of the form (callable, dict)
-    with self.assertRaisesWithPredicateMatch(TypeError, "tuples of the form"):
-      distributions_py.Mixture(cat, [(None, None)])
-    # components tuples must be size 2
-    with self.assertRaisesWithPredicateMatch(TypeError, "tuples of the form"):
-      distributions_py.Mixture(cat, [(None, None, None)])
-    # components tuples must be of the form (callable, dict)
-    with self.assertRaisesWithPredicateMatch(TypeError, "tuples of the form"):
-      distributions_py.Mixture(cat, [(lambda x: x, None)])
-    # components tuples must be of the form (callable, dict)
-    with self.assertRaisesWithPredicateMatch(TypeError, "tuples of the form"):
-      distributions_py.Mixture(cat, [(None, {})])
     with self.assertRaisesWithPredicateMatch(TypeError, "same dtype"):
       distributions_py.Mixture(
           cat,
-          [(distributions_py.Normal, {"mu": [1.0], "sigma": [2.0]}),
-           (distributions_py.Normal, {"mu": [np.float16(1.0)],
-                                      "sigma": [np.float16(2.0)]})])
+          [distributions_py.Normal(mu=[1.0], sigma=[2.0]),
+           distributions_py.Normal(mu=[np.float16(1.0)],
+                                   sigma=[np.float16(2.0)])])
     with self.assertRaisesWithPredicateMatch(ValueError, "non-empty list"):
       distributions_py.Mixture(distributions_py.Categorical([0.3, 0.2]), None)
     with self.assertRaisesWithPredicateMatch(TypeError,
                                              "either be continuous or not"):
       distributions_py.Mixture(
           cat,
-          [(distributions_py.Normal, {"mu": [1.0], "sigma": [2.0]}),
-           (functools.partial(distributions_py.Bernoulli, dtype=tf.float32),
-            {"logits": [1.0]})])
+          [distributions_py.Normal(mu=[1.0], sigma=[2.0]),
+           distributions_py.Bernoulli(dtype=tf.float32, logits=[1.0])])
 
   def testMeanUnivariate(self):
     with self.test_session() as sess:
@@ -196,7 +177,7 @@ class MixtureTest(tf.test.TestCase):
         self.assertEqual(batch_shape, mean.get_shape())
 
         cat_probs = tf.nn.softmax(dist.cat.logits)
-        dist_means = [d.mean() for d in dist.distributions]
+        dist_means = [d.mean() for d in dist.components]
 
         mean_value, cat_probs_value, dist_means_value = sess.run(
             [mean, cat_probs, dist_means])
@@ -217,7 +198,7 @@ class MixtureTest(tf.test.TestCase):
         self.assertEqual(batch_shape + (4,), mean.get_shape())
 
         cat_probs = tf.nn.softmax(dist.cat.logits)
-        dist_means = [d.mean() for d in dist.distributions]
+        dist_means = [d.mean() for d in dist.components]
 
         mean_value, cat_probs_value, dist_means_value = sess.run(
             [mean, cat_probs, dist_means])
@@ -243,7 +224,7 @@ class MixtureTest(tf.test.TestCase):
 
         self.assertEqual(x.shape, p_x.get_shape())
         cat_probs = tf.nn.softmax([dist.cat.logits])[0]
-        dist_probs = [d.prob(x) for d in dist.distributions]
+        dist_probs = [d.prob(x) for d in dist.components]
 
         p_x_value, cat_probs_value, dist_probs_value = sess.run(
             [p_x, cat_probs, dist_probs])
@@ -269,7 +250,7 @@ class MixtureTest(tf.test.TestCase):
         self.assertEqual(x.shape[:-1], p_x.get_shape())
 
         cat_probs = tf.nn.softmax([dist.cat.logits])[0]
-        dist_probs = [d.prob(x) for d in dist.distributions]
+        dist_probs = [d.prob(x) for d in dist.components]
 
         p_x_value, cat_probs_value, dist_probs_value = sess.run(
             [p_x, cat_probs, dist_probs])
@@ -292,7 +273,7 @@ class MixtureTest(tf.test.TestCase):
         self.assertEqual(x.shape, p_x.get_shape())
 
         cat_probs = tf.nn.softmax(dist.cat.logits)
-        dist_probs = [d.prob(x) for d in dist.distributions]
+        dist_probs = [d.prob(x) for d in dist.components]
 
         p_x_value, cat_probs_value, dist_probs_value = sess.run(
             [p_x, cat_probs, dist_probs])
@@ -318,7 +299,7 @@ class MixtureTest(tf.test.TestCase):
         self.assertEqual(x.shape[:-1], p_x.get_shape())
 
         cat_probs = tf.nn.softmax(dist.cat.logits)
-        dist_probs = [d.prob(x) for d in dist.distributions]
+        dist_probs = [d.prob(x) for d in dist.components]
 
         p_x_value, cat_probs_value, dist_probs_value = sess.run(
             [p_x, cat_probs, dist_probs])
@@ -430,7 +411,7 @@ class MixtureTest(tf.test.TestCase):
         self.assertEqual(batch_shape, entropy_lower_bound.get_shape())
 
         cat_probs = tf.nn.softmax(dist.cat.logits)
-        dist_entropy = [d.entropy() for d in dist.distributions]
+        dist_entropy = [d.entropy() for d in dist.components]
 
         entropy_lower_bound_value, cat_probs_value, dist_entropy_value = (
             sess.run([entropy_lower_bound, cat_probs, dist_entropy]))
@@ -486,8 +467,7 @@ class MixtureBenchmark(tf.test.Benchmark):
           tf.Variable(np.random.rand(batch_size, num_features))
           for _ in range(num_components)]
       components = list(
-          (distributions_py.MultivariateNormalDiag,
-           {"mu": mu, "diag_stdev": sigma})
+          distributions_py.MultivariateNormalDiag(mu=mu, diag_stdev=sigma)
           for (mu, sigma) in zip(mus, sigmas))
       return distributions_py.Mixture(cat, components)
 
@@ -524,8 +504,7 @@ class MixtureBenchmark(tf.test.Benchmark):
               psd(np.random.rand(batch_size, num_features, num_features)))
           for _ in range(num_components)]
       components = list(
-          (distributions_py.MultivariateNormalFull,
-           {"mu": mu, "sigma": sigma})
+          distributions_py.MultivariateNormalFull(mu=mu, sigma=sigma)
           for (mu, sigma) in zip(mus, sigmas))
       return distributions_py.Mixture(cat, components)
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/quantized_distribution_test.py b/tensorflow/contrib/distributions/python/kernel_tests/quantized_distribution_test.py
index 7951a9a9c01..c79cb4a2965 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/quantized_distribution_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/quantized_distribution_test.py
@@ -33,7 +33,7 @@ class QuantizedDistributionTest(tf.test.TestCase):
     self.assertTrue(np.isfinite(array).all())
 
   def test_quantization_of_uniform_with_cutoffs_having_no_effect(self):
-    with self.test_session():
+    with self.test_session() as sess:
       # The Quantized uniform with cutoffs == None divides the real line into:
       # R = ...(-1, 0](0, 1](1, 2](2, 3](3, 4]...
       # j = ...     0     1     2     3     4 ...
@@ -60,34 +60,38 @@ class QuantizedDistributionTest(tf.test.TestCase):
             b=3.0)
 
         # pmf
+        pmf_n1, pmf_0, pmf_1, pmf_2, pmf_3, pmf_4, pmf_5 = sess.run(
+            qdist.pmf([-1., 0., 1., 2., 3., 4., 5.]))
         # uniform had no mass below -1.
-        self.assertAllClose(0., qdist.pmf(-1.).eval())
+        self.assertAllClose(0., pmf_n1)
         # uniform had no mass below 0.
-        self.assertAllClose(0., qdist.pmf(0.).eval())
+        self.assertAllClose(0., pmf_0)
         # uniform put 1/3 of its mass in each of (0, 1], (1, 2], (2, 3],
         # which are the intervals j = 1, 2, 3.
-        self.assertAllClose(1 / 3, qdist.pmf(1.).eval())
-        self.assertAllClose(1 / 3, qdist.pmf(2.).eval())
-        self.assertAllClose(1 / 3, qdist.pmf(3.).eval())
+        self.assertAllClose(1 / 3, pmf_1)
+        self.assertAllClose(1 / 3, pmf_2)
+        self.assertAllClose(1 / 3, pmf_3)
         # uniform had no mass in (3, 4] or (4, 5], which are j = 4, 5.
-        self.assertAllClose(0 / 3, qdist.pmf(4.).eval())
-        self.assertAllClose(0 / 3, qdist.pmf(5.).eval())
+        self.assertAllClose(0 / 3, pmf_4)
+        self.assertAllClose(0 / 3, pmf_5)
 
         # cdf
-        self.assertAllClose(0., qdist.cdf(-1.).eval())
-        self.assertAllClose(0., qdist.cdf(0.).eval())
-        self.assertAllClose(1 / 3, qdist.cdf(1.).eval())
-        self.assertAllClose(2 / 3, qdist.cdf(2.).eval())
+        cdf_n1, cdf_0, cdf_1, cdf_2, cdf_2p5, cdf_3, cdf_4, cdf_5 = sess.run(
+            qdist.cdf([-1., 0., 1., 2., 2.5, 3., 4., 5.]))
+        self.assertAllClose(0., cdf_n1)
+        self.assertAllClose(0., cdf_0)
+        self.assertAllClose(1 / 3, cdf_1)
+        self.assertAllClose(2 / 3, cdf_2)
         # Note fractional values allowed for cdfs of discrete distributions.
         # And adding 0.5 makes no difference because the quantized dist has
         # mass only on the integers, never in between.
-        self.assertAllClose(2 / 3, qdist.cdf(2.5).eval())
-        self.assertAllClose(3 / 3, qdist.cdf(3.).eval())
-        self.assertAllClose(3 / 3, qdist.cdf(4.).eval())
-        self.assertAllClose(3 / 3, qdist.cdf(5.).eval())
+        self.assertAllClose(2 / 3, cdf_2p5)
+        self.assertAllClose(3 / 3, cdf_3)
+        self.assertAllClose(3 / 3, cdf_4)
+        self.assertAllClose(3 / 3, cdf_5)
 
   def test_quantization_of_uniform_with_cutoffs_in_the_middle(self):
-    with self.test_session():
+    with self.test_session() as sess:
       # The uniform is supported on [-3, 3]
       # Consider partitions the real line in intervals
       # ...(-3, -2](-2, -1](-1, 0](0, 1](1, 2](2, 3] ...
@@ -103,25 +107,27 @@ class QuantizedDistributionTest(tf.test.TestCase):
           b=3.0)
 
       # pmf
+      cdf_n3, cdf_n2, cdf_n1, cdf_0, cdf_0p5, cdf_1, cdf_10 = sess.run(
+          qdist.cdf([-3., -2., -1., 0., 0.5, 1.0, 10.0]))
       # Uniform had no mass on (-4, -3] or (-3, -2]
-      self.assertAllClose(0., qdist.cdf(-3.).eval())
-      self.assertAllClose(0., qdist.cdf(-2.).eval())
+      self.assertAllClose(0., cdf_n3)
+      self.assertAllClose(0., cdf_n2)
       # Uniform had 1/6 of its mass in each of (-3, -2], and (-2, -1], which
       # were collapsed into (-infty, -1], which is now the "-1" interval.
-      self.assertAllClose(1 / 3, qdist.cdf(-1.).eval())
+      self.assertAllClose(1 / 3, cdf_n1)
       # The j=0 interval contained mass from (-3, 0], which is 1/2 of the
       # uniform's mass.
-      self.assertAllClose(1 / 2, qdist.cdf(0.).eval())
+      self.assertAllClose(1 / 2, cdf_0)
       # Adding 0.5 makes no difference because the quantized dist has mass on
       # the integers, not in between them.
-      self.assertAllClose(1 / 2, qdist.cdf(0.5).eval())
+      self.assertAllClose(1 / 2, cdf_0p5)
       # After applying the cutoff, all mass was either in the interval
       # (0, infty), or below.  (0, infty) is the interval indexed by j=1,
       # so pmf(1) should equal 1.
-      self.assertAllClose(1., qdist.cdf(1.0).eval())
+      self.assertAllClose(1., cdf_1)
       # Since no mass of qdist is above 1,
       # pmf(10) = P[Y <= 10] = P[Y <= 1] = pmf(1).
-      self.assertAllClose(1., qdist.cdf(10.0).eval())
+      self.assertAllClose(1., cdf_10)
 
   def test_quantization_of_batch_of_uniforms(self):
     batch_shape = (5, 5)
@@ -231,10 +237,12 @@ class QuantizedDistributionTest(tf.test.TestCase):
       # The smallest value the samples can take on is 1, which corresponds to
       # the interval (0, 1].  Recall we use ceiling in the sampling definition.
       self.assertLess(0.5, samps.min())
-      for x in range(1, 10):
+      x_vals = np.arange(1, 11).astype(np.float32)
+      pmf_vals = qdist.pmf(x_vals).eval()
+      for ii in range(10):
         self.assertAllClose(
-            qdist.pmf(float(x)).eval(),
-            (samps == x).mean(),
+            pmf_vals[ii],
+            (samps == x_vals[ii]).mean(),
             atol=std_err_bound)
 
   def test_normal_cdf_and_survival_function(self):
diff --git a/tensorflow/contrib/distributions/python/ops/kullback_leibler.py b/tensorflow/contrib/distributions/python/ops/kullback_leibler.py
index 3b766977633..90f2fdf95eb 100644
--- a/tensorflow/contrib/distributions/python/ops/kullback_leibler.py
+++ b/tensorflow/contrib/distributions/python/ops/kullback_leibler.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import distribution
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -32,8 +31,8 @@ def kl(dist_a, dist_b, allow_nan=False, name=None):
   """Get the KL-divergence KL(dist_a || dist_b).
 
   Args:
-    dist_a: instance of distributions.Distribution.
-    dist_b: instance of distributions.Distribution.
+    dist_a: The first distribution.
+    dist_b: The second distribution.
     allow_nan: If `False` (default), a runtime error is raised
       if the KL returns NaN values for any batch entry of the given
       distributions.  If `True`, the KL may return a NaN for the given entry.
@@ -43,18 +42,9 @@ def kl(dist_a, dist_b, allow_nan=False, name=None):
     A Tensor with the batchwise KL-divergence between dist_a and dist_b.
 
   Raises:
-    TypeError: If dist_a or dist_b is not an instance of Distribution.
     NotImplementedError: If no KL method is defined for distribution types
       of dist_a and dist_b.
   """
-  if not isinstance(dist_a, distribution.Distribution):
-    raise TypeError(
-        "dist_a is not an instance of Distribution, received type: %s"
-        % type(dist_a))
-  if not isinstance(dist_b, distribution.Distribution):
-    raise TypeError(
-        "dist_b is not an instance of Distribution, received type: %s"
-        % type(dist_b))
   kl_fn = _DIVERGENCES.get((type(dist_a), type(dist_b)), None)
   if kl_fn is None:
     raise NotImplementedError(
@@ -94,16 +84,7 @@ class RegisterKL(object):
     Args:
       dist_cls_a: the class of the first argument of the KL divergence.
       dist_cls_b: the class of the second argument of the KL divergence.
-
-    Raises:
-      TypeError: if dist_cls_a or dist_cls_b are not subclasses of
-        Distribution.
     """
-
-    if not issubclass(dist_cls_a, distribution.Distribution):
-      raise TypeError("%s is not a subclass of Distribution" % dist_cls_a)
-    if not issubclass(dist_cls_b, distribution.Distribution):
-      raise TypeError("%s is not a subclass of Distribution" % dist_cls_b)
     self._key = (dist_cls_a, dist_cls_b)
 
   def __call__(self, kl_fn):
diff --git a/tensorflow/contrib/distributions/python/ops/mixture.py b/tensorflow/contrib/distributions/python/ops/mixture.py
index add31a5dd8f..05f6f0932d6 100644
--- a/tensorflow/contrib/distributions/python/ops/mixture.py
+++ b/tensorflow/contrib/distributions/python/ops/mixture.py
@@ -56,43 +56,15 @@ class Mixture(distribution.Distribution):
     all having matching dtype, batch shape, event shape, and continuity
     properties (the components).
 
-    The user does not pass the list of distributions directly, but rather a
-    list of `(constructor, batch_tensor_params_dict)` pairs,
-    called `components`. The list of distributions is created via:
-
-    ```python
-    distributions = [
-      c(**params_dict) for (c, params_dict) in zip(*components)
-    ]
-    ```
-
-    This form allows for certain types of batch-shape optimizations within
-    this class.
-
-    An example of `components`:
-
-    ```python
-    components = [
-      (tf.contrib.distributions.Normal, {"mu": 3.0, "sigma": 1.0}),
-      (functools.partial(tf.contrib.distributions.Normal, validate_args=False),
-       {"mu": 3.0, "sigma": 2.0}),
-      (tf.contrib.distributions.Normal.from_params,
-       {"mu": 1.0, "sigma": -1.0})
-    ]
-    ```
-
     The `num_classes` of `cat` must be possible to infer at graph construction
-    time and match `len(distributions)`.
+    time and match `len(components)`.
 
     Args:
       cat: A `Categorical` distribution instance, representing the probabilities
           of `distributions`.
-      components: A list or tuple of `(constructor, batch_tensor_params)`
-        tuples.  The `constructor` must be a callable, and `batch_tensor_params`
-        must be a dict mapping constructor kwargs to batchwise parameters.
-        Each `Distribution` instance created by calling
-        `constructor(**batch_tensor_params)` must have the same type, be defined
-        on the same domain, and have matching `event_shape` and `batch_shape`.
+      components: A list or tuple of `Distribution` instances.
+        Each instance must have the same type, be defined on the same domain,
+        and have matching `event_shape` and `batch_shape`.
       validate_args: `Boolean`, default `False`.  If `True`, raise a runtime
         error if batch or event ranks are inconsistent between cat and any of
         the distributions.  This is only checked if the ranks cannot be
@@ -106,16 +78,13 @@ class Mixture(distribution.Distribution):
     Raises:
       TypeError: If cat is not a `Categorical`, or `components` is not
         a list or tuple, or the elements of `components` are not
-        tuples of the form `(callable, dict)`, or the objects resulting
-        from calling `callable(**dict)` are not instances of `Distribution`, or
-        the resulting instances of `Distribution` do not have matching
-        continuity properties, or do not have matching `dtype`.
-      ValueError: If `components` is an empty list or tuple, or the
-        distributions created from `components` do have a statically known event
-        rank.  If `cat.num_classes` cannot be inferred at graph creation time,
+        instances of `Distribution`, or do not have matching `dtype`.
+      ValueError: If `components` is an empty list or tuple, or its
+        elements do not have a statically known event rank.
+        If `cat.num_classes` cannot be inferred at graph creation time,
         or the constant value of `cat.num_classes` is not equal to
-        `len(distributions)`, or all `distributions` and `cat` do not have
-        matching static batch shapes, or all components' distributions do not
+        `len(components)`, or all `components` and `cat` do not have
+        matching static batch shapes, or all components do not
         have matching static event shapes.
     """
     if not isinstance(cat, categorical.Categorical):
@@ -126,52 +95,29 @@ class Mixture(distribution.Distribution):
     if not isinstance(components, (list, tuple)):
       raise TypeError("components must be a list or tuple, but saw: %s" %
                       components)
-    if not all(isinstance(c, tuple) and len(c) == 2 and
-               callable(c[0]) and isinstance(c[1], dict)
-               for c in components):
+    if not all(isinstance(c, distribution.Distribution) for c in components):
       raise TypeError(
-          "all entries in components must be tuples of the form "
-          "(make, params), where make is callable and params is a dict,"
+          "all entries in components must be Distribution instances"
           " but saw: %s" % components)
 
-    def _make_tensors(d):
-      return dict((k, ops.convert_to_tensor(v, name="tensor_%s" % k))
-                  for (k, v) in d.items())
-
-    with ops.name_scope(name, values=[cat.logits]):
-      components_tensor_params = list((make, _make_tensors(batch_params))
-                                      for (make, batch_params) in components)
-      distributions = [make(**batch_params)
-                       for (make, batch_params) in components_tensor_params]
-
-    # Store components internally with their batch params having been
-    # converted to tensors.
-    # TODO(ebrevdo): Use self._components to optimize sampling.
-    self._components = components_tensor_params
-
-    if not all(isinstance(d, distribution.Distribution) for d in distributions):
+    dtype = components[0].dtype
+    if not all(d.dtype == dtype for d in components):
+      raise TypeError("All components must have the same dtype, but saw "
+                      "dtypes: %s" % [(d.name, d.dtype) for d in components])
+    is_continuous = components[0].is_continuous
+    if not all(d.is_continuous == is_continuous for d in components):
       raise TypeError(
-          "all entries in distributions must be instances of Distribution, "
-          "but saw: %s" % distributions)
-
-    dtype = distributions[0].dtype
-    if not all(d.dtype == dtype for d in distributions):
-      raise TypeError("All distributions must have the same dtype, but saw "
-                      "dtypes: %s" % [(d.name, d.dtype) for d in distributions])
-    is_continuous = distributions[0].is_continuous
-    if not all(d.is_continuous == is_continuous for d in distributions):
-      raise TypeError(
-          "All distributions must either be continuous or not, but continuity "
-          "values are: %s" % [(d.name, d.is_continuous) for d in distributions])
-    static_event_shape = distributions[0].get_event_shape()
+          "All components must either be continuous or not, but continuity "
+          "values are: %s" % [(d.name, d.is_continuous) for d in components])
+    static_event_shape = components[0].get_event_shape()
     static_batch_shape = cat.get_batch_shape()
-    for d in distributions:
+    for d in components:
       static_event_shape = static_event_shape.merge_with(d.get_event_shape())
       static_batch_shape = static_batch_shape.merge_with(d.get_batch_shape())
     if static_event_shape.ndims is None:
       raise ValueError(
-          "Expected to know rank(event_shape) from distributions, but "
-          "none of the distributions provide a static number of ndims")
+          "Expected to know rank(event_shape) from components, but "
+          "none of the components provide a static number of ndims")
 
     # Ensure that all batch and event ndims are consistent.
     with ops.name_scope(name, values=[cat.logits]):
@@ -180,42 +126,42 @@ class Mixture(distribution.Distribution):
       if static_num_components is None:
         raise ValueError(
             "Could not infer number of classes from cat and unable "
-            "to compare this value to the number of distributions passed in.")
+            "to compare this value to the number of components passed in.")
       # Possibly convert from numpy 0-D array.
       static_num_components = int(static_num_components)
-      if static_num_components != len(distributions):
-        raise ValueError("cat.num_classes != len(distributions): %d vs. %d" %
-                         (static_num_components, len(distributions)))
+      if static_num_components != len(components):
+        raise ValueError("cat.num_classes != len(components): %d vs. %d" %
+                         (static_num_components, len(components)))
 
       cat_batch_shape = cat.batch_shape()
       cat_batch_rank = array_ops.size(cat_batch_shape)
       if validate_args:
-        batch_shapes = [d.batch_shape() for d in distributions]
+        batch_shapes = [d.batch_shape() for d in components]
         batch_ranks = [array_ops.size(bs) for bs in batch_shapes]
-        check_message = ("distributions[%d] batch shape must match cat "
+        check_message = ("components[%d] batch shape must match cat "
                          "batch shape")
         self._assertions = [
             check_ops.assert_equal(
                 cat_batch_rank, batch_ranks[di], message=check_message % di)
-            for di in range(len(distributions))
+            for di in range(len(components))
         ]
         self._assertions += [
             check_ops.assert_equal(
                 cat_batch_shape, batch_shapes[di], message=check_message % di)
-            for di in range(len(distributions))
+            for di in range(len(components))
         ]
       else:
         self._assertions = []
 
       self._cat = cat
-      self._distributions = list(distributions)
+      self._components = list(components)
       self._num_components = static_num_components
       self._static_event_shape = static_event_shape
       self._static_batch_shape = static_batch_shape
 
       super(Mixture, self).__init__(
           dtype=dtype,
-          parameters={"cat": self._cat, "distributions": self._distributions,
+          parameters={"cat": self._cat, "components": self._components,
                       "num_components": self._num_components},
           is_reparameterized=False,
           is_continuous=is_continuous,
@@ -228,8 +174,8 @@ class Mixture(distribution.Distribution):
     return self._cat
 
   @property
-  def distributions(self):
-    return self._distributions
+  def components(self):
+    return self._components
 
   @property
   def num_components(self):
@@ -242,14 +188,14 @@ class Mixture(distribution.Distribution):
     return self._static_batch_shape
 
   def _event_shape(self):
-    return self._distributions[0].event_shape()
+    return self._components[0].event_shape()
 
   def _get_event_shape(self):
     return self._static_event_shape
 
   def _mean(self):
     with ops.control_dependencies(self._assertions):
-      distribution_means = [d.mean() for d in self.distributions]
+      distribution_means = [d.mean() for d in self.components]
       cat_probs = self._cat_probs(log_probs=False)
       # This was checked to not be None at construction time.
       static_event_rank = self.get_event_shape().ndims
@@ -271,7 +217,7 @@ class Mixture(distribution.Distribution):
   def _log_prob(self, x):
     with ops.control_dependencies(self._assertions):
       x = ops.convert_to_tensor(x, name="x")
-      distribution_log_probs = [d.log_prob(x) for d in self.distributions]
+      distribution_log_probs = [d.log_prob(x) for d in self.components]
       cat_log_probs = self._cat_probs(log_probs=True)
       final_log_probs = [
           cat_lp + d_lp
@@ -351,7 +297,7 @@ class Mixture(distribution.Distribution):
       samples_class = [None for _ in range(self.num_components)]
       for c in range(self.num_components):
         n_class = array_ops.size(partitioned_samples_indices[c])
-        samples_class_c = self.distributions[c].sample_n(n_class, seed=seed)
+        samples_class_c = self.components[c].sample_n(n_class, seed=seed)
 
         # Pull out the correct batch entries from each index.
         # To do this, we may have to flatten the batch shape.
@@ -395,7 +341,7 @@ class Mixture(distribution.Distribution):
     r"""A lower bound on the entropy of this mixture model.
 
     The bound below is not always very tight, and its usefulness depends
-    on the mixture probabilities and the distributions in use.
+    on the mixture probabilities and the components in use.
 
     A lower bound is useful for ELBO when the `Mixture` is the variational
     distribution:
@@ -432,7 +378,7 @@ class Mixture(distribution.Distribution):
     """
     with self._name_scope(name, values=[self.cat.logits]):
       with ops.control_dependencies(self._assertions):
-        distribution_entropies = [d.entropy() for d in self.distributions]
+        distribution_entropies = [d.entropy() for d in self.components]
         cat_probs = self._cat_probs(log_probs=False)
         partial_entropies = [
             c_p * m for (c_p, m) in zip(cat_probs, distribution_entropies)
diff --git a/tensorflow/contrib/ffmpeg/decode_audio_op.cc b/tensorflow/contrib/ffmpeg/decode_audio_op.cc
index 10e35e165b2..15a31f9d311 100644
--- a/tensorflow/contrib/ffmpeg/decode_audio_op.cc
+++ b/tensorflow/contrib/ffmpeg/decode_audio_op.cc
@@ -35,7 +35,7 @@ namespace {
 // The complete set of audio file formats that are supported by the op. These
 // strings are defined by FFmpeg and documented here:
 // https://www.ffmpeg.org/ffmpeg-formats.html
-const char* kValidFileFormats[] = {"mp3", "ogg", "wav"};
+const char* kValidFileFormats[] = {"mp3", "mp4", "ogg", "wav"};
 
 // Writes binary data to a file.
 Status WriteFile(const string& filename, tensorflow::StringPiece contents) {
diff --git a/tensorflow/contrib/ffmpeg/decode_audio_op_test.py b/tensorflow/contrib/ffmpeg/decode_audio_op_test.py
index 58d0ab11b1d..6e85d360ccb 100644
--- a/tensorflow/contrib/ffmpeg/decode_audio_op_test.py
+++ b/tensorflow/contrib/ffmpeg/decode_audio_op_test.py
@@ -61,10 +61,30 @@ class DecodeAudioOpTest(tf.test.TestCase):
     self._loadFileAndTest('mono_16khz.mp3', 'mp3', 0.57, 20000, 1)
     self._loadFileAndTest('mono_16khz.mp3', 'mp3', 0.57, 20000, 2)
 
+  def testMonoMp4Mp3Codec(self):
+    # mp3 compressed audio streams in mp4 container.
+    self._loadFileAndTest('mono_16khz_mp3.mp4', 'mp4', 2.77, 20000, 1)
+    self._loadFileAndTest('mono_16khz_mp3.mp4', 'mp4', 2.77, 20000, 2)
+
+  def testMonoMp4AacCodec(self):
+    # aac compressed audio streams in mp4 container.
+    self._loadFileAndTest('mono_32khz_aac.mp4', 'mp4', 2.77, 20000, 1)
+    self._loadFileAndTest('mono_32khz_aac.mp4', 'mp4', 2.77, 20000, 2)
+
   def testStereoMp3(self):
     self._loadFileAndTest('stereo_48khz.mp3', 'mp3', 0.79, 50000, 1)
     self._loadFileAndTest('stereo_48khz.mp3', 'mp3', 0.79, 20000, 2)
 
+  def testStereoMp4Mp3Codec(self):
+    # mp3 compressed audio streams in mp4 container.
+    self._loadFileAndTest('stereo_48khz_mp3.mp4', 'mp4', 0.79, 50000, 1)
+    self._loadFileAndTest('stereo_48khz_mp3.mp4', 'mp4', 0.79, 20000, 2)
+
+  def testStereoMp4AacCodec(self):
+    # aac compressed audio streams in mp4 container.
+    self._loadFileAndTest('stereo_48khz_aac.mp4', 'mp4', 0.79, 50000, 1)
+    self._loadFileAndTest('stereo_48khz_aac.mp4', 'mp4', 0.79, 20000, 2)
+
   def testMonoWav(self):
     self._loadFileAndTest('mono_10khz.wav', 'wav', 0.57, 5000, 1)
     self._loadFileAndTest('mono_10khz.wav', 'wav', 0.57, 10000, 4)
diff --git a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py
index 4902e3b7128..c3c921eb996 100644
--- a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py
+++ b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py
@@ -35,11 +35,14 @@ def decode_audio(contents, file_format=None, samples_per_second=None,
                  channel_count=None):
   """Create an op that decodes the contents of an audio file.
 
+  Note that ffmpeg is free to select the "best" audio track from an mp4.
+  https://trac.ffmpeg.org/wiki/Map
+
   Args:
     contents: The binary contents of the audio file to decode. This is a
         scalar.
     file_format: A string specifying which format the contents will conform
-        to. This can be mp3, ogg, or wav.
+        to. This can be mp3, mp4, ogg, or wav.
     samples_per_second: The number of samples per second that is assumed.
         In some cases, resampling will occur to generate the correct sample
         rate.
diff --git a/tensorflow/contrib/ffmpeg/testdata/mono_16khz_mp3.mp4 b/tensorflow/contrib/ffmpeg/testdata/mono_16khz_mp3.mp4
new file mode 100644
index 00000000000..424f4b6e1a7
Binary files /dev/null and b/tensorflow/contrib/ffmpeg/testdata/mono_16khz_mp3.mp4 differ
diff --git a/tensorflow/contrib/ffmpeg/testdata/mono_32khz_aac.mp4 b/tensorflow/contrib/ffmpeg/testdata/mono_32khz_aac.mp4
new file mode 100644
index 00000000000..6577e6f58af
Binary files /dev/null and b/tensorflow/contrib/ffmpeg/testdata/mono_32khz_aac.mp4 differ
diff --git a/tensorflow/contrib/ffmpeg/testdata/stereo_48khz_aac.mp4 b/tensorflow/contrib/ffmpeg/testdata/stereo_48khz_aac.mp4
new file mode 100644
index 00000000000..bc71bf95e8e
Binary files /dev/null and b/tensorflow/contrib/ffmpeg/testdata/stereo_48khz_aac.mp4 differ
diff --git a/tensorflow/contrib/ffmpeg/testdata/stereo_48khz_mp3.mp4 b/tensorflow/contrib/ffmpeg/testdata/stereo_48khz_mp3.mp4
new file mode 100644
index 00000000000..9f9b2072f80
Binary files /dev/null and b/tensorflow/contrib/ffmpeg/testdata/stereo_48khz_mp3.mp4 differ
diff --git a/tensorflow/contrib/learn/python/learn/estimators/classifier_test.py b/tensorflow/contrib/learn/python/learn/estimators/classifier_test.py
index 78870ff6787..a2883c39d37 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/classifier_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/classifier_test.py
@@ -125,7 +125,8 @@ class ClassifierTest(tf.test.TestCase):
       default_signature = signatures.default_signature
       return default_signature
 
-  def testExportMonitorRegressionSignature(self):
+  # Disable this test case until b/31032996 is fixed.
+  def _testExportMonitorRegressionSignature(self):
     iris = tf.contrib.learn.datasets.load_iris()
     est = tf.contrib.learn.Classifier(model_fn=logistic_model_fn, n_classes=3)
     export_dir = tempfile.mkdtemp() + 'export/'
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn.py b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
index fa1e41d9824..c1dfc262b63 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
@@ -19,11 +19,269 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import tempfile
+
+from tensorflow.contrib import layers
+from tensorflow.contrib import metrics as metrics_lib
+from tensorflow.contrib.framework import deprecated
+from tensorflow.contrib.framework import deprecated_arg_values
+from tensorflow.contrib.framework import list_variables
+from tensorflow.contrib.framework import load_variable
+from tensorflow.contrib.framework.python.ops import variables as contrib_variables
+from tensorflow.contrib.layers.python.layers import optimizers
+from tensorflow.contrib.learn.python.learn import evaluable
+from tensorflow.contrib.learn.python.learn import metric_spec
+from tensorflow.contrib.learn.python.learn import session_run_hook
+from tensorflow.contrib.learn.python.learn import trainable
 from tensorflow.contrib.learn.python.learn.estimators import dnn_linear_combined
+from tensorflow.contrib.learn.python.learn.estimators import estimator
+from tensorflow.contrib.learn.python.learn.utils import checkpoints
+from tensorflow.contrib.learn.python.learn.utils import export
+from tensorflow.contrib.losses.python.losses import loss_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import logging_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
+from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.training import training as train
 
 
-class DNNClassifier(dnn_linear_combined.DNNLinearCombinedClassifier):
+_CENTERED_BIAS = "centered_bias"
+_CENTERED_BIAS_WEIGHT = "centered_bias_weight"
+_CLASSES = "classes"
+_LOGISTIC = "logistic"
+_PROBABILITIES = "probabilities"
+
+# The default learning rate of 0.05 is a historical artifact of the initial
+# implementation, but seems a reasonable choice.
+_LEARNING_RATE = 0.05
+
+
+def _as_iterable(preds, output):
+  for pred in preds:
+    yield pred[output]
+
+
+def _get_feature_dict(features):
+  if isinstance(features, dict):
+    return features
+  return {"": features}
+
+
+def _get_optimizer(optimizer):
+  if callable(optimizer):
+    return optimizer()
+  else:
+    return optimizer
+
+
+def _add_hidden_layer_summary(value, tag):
+  logging_ops.scalar_summary("%s:fraction_of_zero_values" % tag,
+                             nn.zero_fraction(value))
+  logging_ops.histogram_summary("%s:activation" % tag, value)
+
+
+def _centered_bias(num_label_columns):
+  centered_bias = variables.Variable(
+      array_ops.zeros([num_label_columns]),
+      collections=[_CENTERED_BIAS, ops.GraphKeys.VARIABLES],
+      name=_CENTERED_BIAS_WEIGHT)
+  logging_ops.scalar_summary(
+      ["centered_bias %d" % cb for cb in range(num_label_columns)],
+      array_ops.reshape(centered_bias, [-1]))
+  return centered_bias
+
+
+def _centered_bias_step(targets, loss_fn, num_label_columns):
+  centered_bias = ops.get_collection(_CENTERED_BIAS)
+  batch_size = array_ops.shape(targets)[0]
+  logits = array_ops.reshape(
+      array_ops.tile(centered_bias[0], [batch_size]),
+      [batch_size, num_label_columns])
+  loss = loss_fn(logits, targets)
+  return train.AdagradOptimizer(0.1).minimize(loss, var_list=centered_bias)
+
+
+def _get_weight_tensor(features, weight_column_name):
+  """Returns the weight tensor of shape [batch_size] or 1."""
+  if weight_column_name is None:
+    return 1.0
+  else:
+    return array_ops.reshape(
+        math_ops.to_float(features[weight_column_name]),
+        shape=(-1,))
+
+
+def _rescale_eval_loss(loss, weights):
+  """Rescales evaluation loss according to the given weights.
+
+  The rescaling is needed because in the training loss weights are not
+  considered in the denominator, whereas for the evaluation loss we should
+  divide by the sum of weights.
+
+  The rescaling factor is:
+    R = sum_{i} 1 / sum_{i} w_{i}
+
+  Args:
+    loss: the scalar weighted loss.
+    weights: weight coefficients. Either a scalar, or a `Tensor` of shape
+      [batch_size].
+
+  Returns:
+    The given loss multiplied by the rescaling factor.
+  """
+  rescaling_factor = math_ops.reduce_mean(weights)
+  return math_ops.div(loss, rescaling_factor)
+
+
+def _predictions(logits, n_classes):
+  """Returns predictions for the given logits and n_classes."""
+  predictions = {}
+  if n_classes == 2:
+    predictions[_LOGISTIC] = math_ops.sigmoid(logits)
+    logits = array_ops.concat(1, [array_ops.zeros_like(logits), logits])
+  predictions[_PROBABILITIES] = nn.softmax(logits)
+  predictions[_CLASSES] = array_ops.reshape(
+      math_ops.argmax(logits, 1), shape=(-1, 1))
+  return predictions
+
+
+def _dnn_classifier_model_fn(features, targets, mode, params):
+  """Deep Neural Net model_fn.
+
+  Args:
+    features: `Tensor` or dict of `Tensor` (depends on data passed to `fit`).
+    targets: `Tensor` of shape [batch_size, 1] or [batch_size] target labels of
+      dtype `int32` or `int64` in the range `[0, n_classes)`.
+    mode: Defines whether this is training, evaluation or prediction.
+      See `ModeKeys`.
+    params: A dict of hyperparameters.
+      The following hyperparameters are expected:
+      * hidden_units: List of hidden units per layer.
+      * feature_columns: An iterable containing all the feature columns used by
+          the model.
+      * n_classes: number of target classes.
+      * weight_column_name: A string defining the weight feature column, or
+          None if there are no weights.
+      * optimizer: string, `Optimizer` object, or callable that defines the
+          optimizer to use for training.
+      * activation_fn: Activation function applied to each layer. If `None`,
+          will use `tf.nn.relu`.
+      * dropout: When not `None`, the probability we will drop out a given
+          coordinate.
+      * gradient_clip_norm: A float > 0. If provided, gradients are
+          clipped to their global norm with this clipping ratio.
+      * enable_centered_bias: A bool. If True, estimator will learn a centered
+          bias variable for each class. Rest of the model structure learns the
+          residual after centered bias.
+      * num_ps_replicas: The number of parameter server replicas.
+
+  Returns:
+    predictions: A dict of `Tensor` objects.
+    loss: A scalar containing the loss of the step.
+    train_op: The op for training.
+  """
+  hidden_units = params["hidden_units"]
+  feature_columns = params["feature_columns"]
+  n_classes = params["n_classes"]
+  weight_column_name = params["weight_column_name"]
+  optimizer = params["optimizer"]
+  activation_fn = params["activation_fn"]
+  dropout = params["dropout"]
+  gradient_clip_norm = params["gradient_clip_norm"]
+  enable_centered_bias = params["enable_centered_bias"]
+  num_ps_replicas = params["num_ps_replicas"]
+
+  features = _get_feature_dict(features)
+  parent_scope = "dnn"
+  num_label_columns = 1 if n_classes == 2 else n_classes
+  if n_classes == 2:
+    loss_fn = loss_ops.sigmoid_cross_entropy
+  else:
+    loss_fn = loss_ops.sparse_softmax_cross_entropy
+
+  input_layer_partitioner = (
+      partitioned_variables.min_max_variable_partitioner(
+          max_partitions=num_ps_replicas,
+          min_slice_size=64 << 20))
+  with variable_scope.variable_scope(
+      parent_scope + "/input_from_feature_columns",
+      values=features.values(),
+      partitioner=input_layer_partitioner) as scope:
+    net = layers.input_from_feature_columns(
+        columns_to_tensors=features,
+        feature_columns=feature_columns,
+        weight_collections=[parent_scope],
+        scope=scope)
+
+  hidden_layer_partitioner = (
+      partitioned_variables.min_max_variable_partitioner(
+          max_partitions=num_ps_replicas))
+  for layer_id, num_hidden_units in enumerate(hidden_units):
+    with variable_scope.variable_scope(
+        parent_scope + "/hiddenlayer_%d" % layer_id,
+        values=[net],
+        partitioner=hidden_layer_partitioner) as scope:
+      net = layers.fully_connected(
+          net,
+          num_hidden_units,
+          activation_fn=activation_fn,
+          variables_collections=[parent_scope],
+          scope=scope)
+      if dropout is not None and mode == estimator.ModeKeys.TRAIN:
+        net = layers.dropout(
+            net,
+            keep_prob=(1.0 - dropout))
+    _add_hidden_layer_summary(net, scope.name)
+
+  with variable_scope.variable_scope(
+      parent_scope + "/logits",
+      values=[net],
+      partitioner=hidden_layer_partitioner) as scope:
+    logits = layers.fully_connected(
+        net,
+        num_label_columns,
+        activation_fn=None,
+        variables_collections=[parent_scope],
+        scope=scope)
+  _add_hidden_layer_summary(logits, scope.name)
+
+  if enable_centered_bias:
+    logits = nn.bias_add(logits, _centered_bias(num_label_columns))
+
+  if mode == estimator.ModeKeys.TRAIN:
+    loss = loss_fn(logits, targets,
+                   weight=_get_weight_tensor(features, weight_column_name))
+
+    train_ops = [optimizers.optimize_loss(
+        loss=loss, global_step=contrib_variables.get_global_step(),
+        learning_rate=_LEARNING_RATE, optimizer=_get_optimizer(optimizer),
+        clip_gradients=gradient_clip_norm, name=parent_scope)]
+    if enable_centered_bias:
+      train_ops.append(_centered_bias_step(targets, loss_fn, num_label_columns))
+
+    return None, loss, control_flow_ops.group(*train_ops)
+
+  elif mode == estimator.ModeKeys.EVAL:
+    predictions = _predictions(logits=logits, n_classes=n_classes)
+
+    weight = _get_weight_tensor(features, weight_column_name)
+    training_loss = loss_fn(logits, targets, weight=weight)
+    loss = _rescale_eval_loss(training_loss, weight)
+
+    return predictions, loss, []
+
+  else:  # mode == estimator.ModeKeys.INFER:
+    predictions = _predictions(logits=logits, n_classes=n_classes)
+
+    return predictions, None, []
+
+
+class DNNClassifier(evaluable.Evaluable, trainable.Trainable):
   """A classifier for TensorFlow DNN models.
 
   Example:
@@ -124,36 +382,211 @@ class DNNClassifier(dnn_linear_combined.DNNLinearCombinedClassifier):
 
     Returns:
       A `DNNClassifier` estimator.
+
+    Raises:
+      ValueError: If `n_classes` < 2.
     """
     if enable_centered_bias is None:
       enable_centered_bias = True
       dnn_linear_combined._changing_default_center_bias()  # pylint: disable=protected-access
-    super(DNNClassifier, self).__init__(
-        model_dir=model_dir,
-        n_classes=n_classes,
-        weight_column_name=weight_column_name,
-        dnn_feature_columns=feature_columns,
-        dnn_optimizer=optimizer,
-        dnn_hidden_units=hidden_units,
-        dnn_activation_fn=activation_fn,
-        dnn_dropout=dropout,
-        gradient_clip_norm=gradient_clip_norm,
-        enable_centered_bias=enable_centered_bias,
-        config=config)
-    self.feature_columns = feature_columns
-    self.optimizer = optimizer
-    self.activation_fn = activation_fn
-    self.dropout = dropout
-    self.hidden_units = hidden_units
-    self._feature_columns_inferred = False
+    self._hidden_units = hidden_units
+    self._feature_columns = feature_columns
+    self._model_dir = model_dir or tempfile.mkdtemp()
+    if n_classes <= 1:
+      raise ValueError(
+          "Classification requires n_classes >= 2. Given: {}".format(n_classes))
+    self._n_classes = n_classes
+    self._weight_column_name = weight_column_name
+    optimizer = optimizer or "Adagrad"
+    num_ps_replicas = config.num_ps_replicas if config else 0
+
+    self._estimator = estimator.Estimator(
+        model_fn=_dnn_classifier_model_fn,
+        model_dir=self._model_dir,
+        config=config,
+        params={
+            "hidden_units": hidden_units,
+            "feature_columns": feature_columns,
+            "n_classes": n_classes,
+            "weight_column_name": weight_column_name,
+            "optimizer": optimizer,
+            "activation_fn": activation_fn,
+            "dropout": dropout,
+            "gradient_clip_norm": gradient_clip_norm,
+            "enable_centered_bias": enable_centered_bias,
+            "num_ps_replicas": num_ps_replicas,
+        })
+
+  def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None,
+          monitors=None, max_steps=None):
+    """See trainable.Trainable."""
+    # TODO(roumposg): Remove when deprecated monitors are removed.
+    if monitors is not None:
+      deprecated_monitors = [
+          m for m in monitors
+          if not isinstance(m, session_run_hook.SessionRunHook)
+      ]
+      for monitor in deprecated_monitors:
+        monitor.set_estimator(self)
+        monitor._lock_estimator()  # pylint: disable=protected-access
+
+    result = self._estimator.fit(x=x, y=y, input_fn=input_fn, steps=steps,
+                                 batch_size=batch_size, monitors=monitors,
+                                 max_steps=max_steps)
+
+    if monitors is not None:
+      for monitor in deprecated_monitors:
+        monitor._unlock_estimator()  # pylint: disable=protected-access
+
+    return result
+
+  def evaluate(self, x=None, y=None, input_fn=None, feed_fn=None,
+               batch_size=None, steps=None, metrics=None, name=None):
+    """See evaluable.Evaluable."""
+    if metrics is None:
+      metrics = {}
+    metrics.update({
+        "accuracy": metric_spec.MetricSpec(
+            metric_fn=metrics_lib.streaming_accuracy,
+            prediction_key=_CLASSES,
+            weight_key=self._weight_column_name)})
+    if self._n_classes == 2:
+      metrics.update({
+          "auc": metric_spec.MetricSpec(
+              metric_fn=metrics_lib.streaming_auc,
+              prediction_key=_LOGISTIC,
+              weight_key=self._weight_column_name)})
+    return self._estimator.evaluate(
+        x=x, y=y, input_fn=input_fn, feed_fn=feed_fn, batch_size=batch_size,
+        steps=steps, metrics=metrics, name=name)
+
+  @deprecated_arg_values(
+      estimator.AS_ITERABLE_DATE, estimator.AS_ITERABLE_INSTRUCTIONS,
+      as_iterable=False)
+  def predict(self, x=None, input_fn=None, batch_size=None, as_iterable=False):
+    """Returns predicted classes for given features.
+
+    Args:
+      x: features.
+      input_fn: Input function. If set, x must be None.
+      batch_size: Override default batch size.
+      as_iterable: If True, return an iterable which keeps yielding predictions
+        for each example until inputs are exhausted. Note: The inputs must
+        terminate if you want the iterable to terminate (e.g. be sure to pass
+        num_epochs=1 if you are using something like read_batch_features).
+
+    Returns:
+      Numpy array of predicted classes (or an iterable of predicted classes if
+      as_iterable is True).
+    """
+    preds = self._estimator.predict(x=x, input_fn=input_fn,
+                                    batch_size=batch_size, outputs=[_CLASSES],
+                                    as_iterable=as_iterable)
+    if as_iterable:
+      return _as_iterable(preds, output=_CLASSES)
+    return preds[_CLASSES].reshape(-1)
+
+  @deprecated_arg_values(
+      estimator.AS_ITERABLE_DATE, estimator.AS_ITERABLE_INSTRUCTIONS,
+      as_iterable=False)
+  def predict_proba(
+      self, x=None, input_fn=None, batch_size=None, as_iterable=False):
+    """Returns prediction probabilities for given features.
+
+    Args:
+      x: features.
+      input_fn: Input function. If set, x and y must be None.
+      batch_size: Override default batch size.
+      as_iterable: If True, return an iterable which keeps yielding predictions
+        for each example until inputs are exhausted. Note: The inputs must
+        terminate if you want the iterable to terminate (e.g. be sure to pass
+        num_epochs=1 if you are using something like read_batch_features).
+
+    Returns:
+      Numpy array of predicted probabilities (or an iterable of predicted
+      probabilities if as_iterable is True).
+    """
+    preds = self._estimator.predict(x=x, input_fn=input_fn,
+                                    batch_size=batch_size,
+                                    outputs=[_PROBABILITIES],
+                                    as_iterable=as_iterable)
+    if as_iterable:
+      return _as_iterable(preds, output=_PROBABILITIES)
+    return preds[_PROBABILITIES]
+
+  def get_variable_names(self):
+    """Returns list of all variable names in this model.
+
+    Returns:
+      List of names.
+    """
+    return [name for name, _ in list_variables(self._model_dir)]
+
+  def get_variable_value(self, name):
+    """Returns value of the variable given by name.
+
+    Args:
+      name: string, name of the tensor.
+
+    Returns:
+      `Tensor` object.
+    """
+    return load_variable(self._model_dir, name)
+
+  def export(self,
+             export_dir,
+             input_fn=None,
+             input_feature_key=None,
+             use_deprecated_input_fn=True,
+             signature_fn=None,
+             default_batch_size=1,
+             exports_to_keep=None):
+    """See BaseEstimator.export."""
+    def default_input_fn(unused_estimator, examples):
+      return layers.parse_feature_columns_from_examples(
+          examples, self._feature_columns)
+    self._estimator.export(
+        export_dir=export_dir,
+        input_fn=input_fn or default_input_fn,
+        input_feature_key=input_feature_key,
+        use_deprecated_input_fn=use_deprecated_input_fn,
+        signature_fn=(
+            signature_fn or export.classification_signature_fn_with_prob),
+        prediction_key=_PROBABILITIES,
+        default_batch_size=default_batch_size,
+        exports_to_keep=exports_to_keep)
 
   @property
+  def model_dir(self):
+    return self._model_dir
+
+  @property
+  @deprecated("2016-10-13", "This method inspects the private state of the "
+              "object, and should not be used")
   def weights_(self):
-    return self.dnn_weights_
+    hiddenlayer_weights = [checkpoints.load_variable(
+        self._model_dir, name=("dnn/hiddenlayer_%d/weights" % i))
+                           for i, _ in enumerate(self._hidden_units)]
+    logits_weights = [checkpoints.load_variable(
+        self._model_dir, name="dnn/logits/weights")]
+    return hiddenlayer_weights + logits_weights
 
   @property
+  @deprecated("2016-10-13", "This method inspects the private state of the "
+              "object, and should not be used")
   def bias_(self):
-    return self.dnn_bias_
+    hiddenlayer_bias = [checkpoints.load_variable(
+        self._model_dir, name=("dnn/hiddenlayer_%d/biases" % i))
+                        for i, _ in enumerate(self._hidden_units)]
+    logits_bias = [checkpoints.load_variable(
+        self._model_dir, name="dnn/logits/biases")]
+    centered_bias = [checkpoints.load_variable(
+        self._model_dir, name=_CENTERED_BIAS_WEIGHT)]
+    return hiddenlayer_bias + logits_bias + centered_bias
+
+  @property
+  def config(self):
+    return self._estimator.config
 
 
 class DNNRegressor(dnn_linear_combined.DNNLinearCombinedRegressor):
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
index 64d2fe6d70c..94145da0f15 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
@@ -27,13 +27,8 @@ import tensorflow as tf
 
 from tensorflow.contrib.learn.python.learn.estimators import _sklearn
 from tensorflow.contrib.learn.python.learn.estimators import estimator_test_utils
-
-# pylint: disable=g-import-not-at-top
-try:
-  from sklearn.cross_validation import cross_val_score
-  HAS_SKLEARN = True
-except ImportError:
-  HAS_SKLEARN = False
+from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
+from tensorflow.python.ops import math_ops
 
 
 def _prepare_iris_data_for_logistic_regression():
@@ -350,6 +345,7 @@ class DNNClassifierTest(tf.test.TestCase):
       # For the case of binary classification, the 2nd column of "predictions"
       # denotes the model predictions.
       predictions = tf.slice(predictions, [0, 1], [-1, 1])
+      targets = math_ops.cast(targets, predictions.dtype)
       return tf.reduce_sum(tf.mul(predictions, targets))
 
     classifier = tf.contrib.learn.DNNClassifier(
@@ -362,9 +358,15 @@ class DNNClassifierTest(tf.test.TestCase):
         input_fn=_input_fn_train,
         steps=100,
         metrics={
-            'my_accuracy': tf.contrib.metrics.streaming_accuracy,
-            ('my_precision', 'classes'): tf.contrib.metrics.streaming_precision,
-            ('my_metric', 'probabilities'): _my_metric_op
+            'my_accuracy': MetricSpec(
+                metric_fn=tf.contrib.metrics.streaming_accuracy,
+                prediction_key='classes'),
+            'my_precision': MetricSpec(
+                metric_fn=tf.contrib.metrics.streaming_precision,
+                prediction_key='classes'),
+            'my_metric': MetricSpec(
+                metric_fn=_my_metric_op,
+                prediction_key='probabilities')
         })
     self.assertTrue(
         set(['loss', 'my_accuracy', 'my_precision', 'my_metric'
@@ -375,21 +377,14 @@ class DNNClassifierTest(tf.test.TestCase):
 
     # Test the case where the 2nd element of the key is neither "classes" nor
     # "probabilities".
-    with self.assertRaises(ValueError):
-      classifier.evaluate(
-          input_fn=_input_fn_train,
-          steps=100,
-          metrics={('bad_name', 'bad_type'): tf.contrib.metrics.streaming_auc})
-
-    # Test the case where the tuple of the key doesn't have 2 elements.
-    with self.assertRaises(ValueError):
+    with self.assertRaisesRegexp(KeyError, 'bad_type'):
       classifier.evaluate(
           input_fn=_input_fn_train,
           steps=100,
           metrics={
-              ('bad_length_name', 'classes', 'bad_length'):
-                  tf.contrib.metrics.streaming_accuracy
-          })
+              'bad_name': MetricSpec(
+                  metric_fn=tf.contrib.metrics.streaming_auc,
+                  prediction_key='bad_type')})
 
   def testTrainSaveLoad(self):
     """Tests that insures you can save and reload a trained model."""
@@ -466,6 +461,31 @@ class DNNClassifierTest(tf.test.TestCase):
     self.assertGreater(scores['accuracy'], 0.9)
     self.assertLess(scores['loss'], 0.3)
 
+  def testExport(self):
+    """Tests export model for servo."""
+
+    def input_fn():
+      return {
+          'age': tf.constant([1]),
+          'language': tf.SparseTensor(values=['english'],
+                                      indices=[[0, 0]],
+                                      shape=[1, 1])
+      }, tf.constant([[1]])
+
+    language = tf.contrib.layers.sparse_column_with_hash_bucket('language', 100)
+    feature_columns = [
+        tf.contrib.layers.real_valued_column('age'),
+        tf.contrib.layers.embedding_column(language, dimension=1)
+    ]
+
+    classifier = tf.contrib.learn.DNNClassifier(
+        feature_columns=feature_columns,
+        hidden_units=[3, 3])
+    classifier.fit(input_fn=input_fn, steps=100)
+
+    export_dir = tempfile.mkdtemp()
+    classifier.export(export_dir)
+
   def testDisableCenteredBias(self):
     """Tests that we can disable centered bias."""
     cont_features = [
@@ -484,32 +504,6 @@ class DNNClassifierTest(tf.test.TestCase):
     self.assertGreater(scores['accuracy'], 0.8)
     self.assertLess(scores['loss'], 0.3)
 
-  def testSklearnCompatibility(self):
-    """Tests compatibility with sklearn"""
-    if not HAS_SKLEARN:
-      return
-    iris = tf.contrib.learn.datasets.load_iris()
-
-    cont_features = [
-        tf.contrib.layers.real_valued_column('', dimension=4)]
-    kwargs = {
-        'n_classes': 3,
-        'feature_columns': cont_features,
-        'optimizer' : 'Adam',
-        'hidden_units' : [3, 4]
-    }
-
-    classifier = tf.contrib.learn.DNNClassifier(**kwargs)
-
-    scores = cross_val_score(
-      classifier,
-      iris.data[1:5],
-      iris.target[1:5],
-      scoring='accuracy',
-      fit_params={'steps': 100}
-    )
-    self.assertAllClose(scores, [1, 1, 1])
-
 
 class DNNRegressorTest(tf.test.TestCase):
 
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py b/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py
index ab3de2251a1..f20d2483951 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py
@@ -234,7 +234,7 @@ def read_keyed_batch_features(file_pattern,
                               queue_capacity=10000,
                               reader_num_threads=1,
                               feature_queue_capacity=100,
-                              num_enqueue_threads=2,
+                              num_queue_runners=2,
                               parser_num_threads=None,
                               parse_fn=None,
                               name=None):
@@ -266,8 +266,8 @@ def read_keyed_batch_features(file_pattern,
     queue_capacity: Capacity for input queue.
     reader_num_threads: The number of threads to read examples.
     feature_queue_capacity: Capacity of the parsed features queue.
-    num_enqueue_threads: Number of threads to enqueue the parsed example queue.
-      Using multiple threads to enqueue the parsed example queue helps maintain
+    num_queue_runners: Number of queue runners to start for the feature queue.
+      Adding multiple queue runners for the parsed example queue helps maintain
       a full queue when the subsequent computations overall are cheaper than
       parsing.
     parser_num_threads: (Deprecated) The number of threads to parse examples.
@@ -300,14 +300,14 @@ def read_keyed_batch_features(file_pattern,
         feature_map,
         keys=keys,
         feature_queue_capacity=feature_queue_capacity,
-        num_enqueue_threads=num_enqueue_threads,
+        num_queue_runners=num_queue_runners,
         name=scope)
 
 
 def queue_parsed_features(parsed_features,
                           keys=None,
                           feature_queue_capacity=100,
-                          num_enqueue_threads=2,
+                          num_queue_runners=2,
                           name=None):
   """Speeds up parsing by using queues to do it asynchronously.
 
@@ -326,8 +326,8 @@ def queue_parsed_features(parsed_features,
     parsed_features: A dict of string key to `Tensor` or `SparseTensor` objects.
     keys: `Tensor` of string keys.
     feature_queue_capacity: Capacity of the parsed features queue.
-    num_enqueue_threads: Number of threads to enqueue the parsed example queue.
-      Using multiple thrads to enqueue the parsed example queue helps maintain
+    num_queue_runners: Number of queue runners to start for the feature queue.
+      Adding multiple queue runners for the parsed example queue helps maintain
       a full queue when the subsequent computations overall are cheaper than
       parsing.
     name: Name of resulting op.
@@ -374,14 +374,14 @@ def queue_parsed_features(parsed_features,
                                math_ops.cast(input_queue.size(), dtypes.float32)
                                * (1. / feature_queue_capacity))
 
-    # Use multiple threads to enqueue so the queue is always full. Adding more
-    # than two threads may hog the cpu on the worker to fill up the queue.
-    enqueue_ops = [input_queue.enqueue(tensors_to_enqueue)
-                   for _ in range(num_enqueue_threads)]
-    queue_runner.add_queue_runner(queue_runner.QueueRunner(
-        input_queue, enqueue_ops,
-        queue_closed_exception_types=(errors.OutOfRangeError,
-                                      errors.CancelledError)))
+    # Add multiple queue runners so that the queue is always full. Adding more
+    # than two queue-runners may hog the cpu on the worker to fill up the queue.
+    for _ in range(num_queue_runners):
+      queue_runner.add_queue_runner(
+          queue_runner.QueueRunner(
+              input_queue, [input_queue.enqueue(tensors_to_enqueue)],
+              queue_closed_exception_types=(errors.OutOfRangeError,
+                                            errors.CancelledError)))
 
     dequeued_tensors = input_queue.dequeue()
 
diff --git a/tensorflow/contrib/learn/python/learn/learn_runner.py b/tensorflow/contrib/learn/python/learn/learn_runner.py
index b473b99c8f6..9f9b10b5df9 100644
--- a/tensorflow/contrib/learn/python/learn/learn_runner.py
+++ b/tensorflow/contrib/learn/python/learn/learn_runner.py
@@ -83,8 +83,6 @@ def run(experiment_fn, output_dir, schedule=None):
   # Get the schedule
   config = experiment.estimator.config
   schedule = schedule or _get_default_schedule(config)
-  if not schedule:
-    raise ValueError('Must specify a schedule')
 
   # Execute the schedule
   if not hasattr(experiment, schedule):
@@ -107,19 +105,36 @@ def run(experiment_fn, output_dir, schedule=None):
   return task()
 
 
+def _is_distributed(config):
+  """Returns true if this is a distributed job."""
+  if not config.cluster_spec:
+    return False
+
+  # This is considered a distributed job if there is more than one task
+  # in the cluster spec.
+  task_count = 0
+  for job in config.cluster_spec.jobs:
+    for _ in config.cluster_spec.job_tasks(job):
+      task_count += 1
+
+  return task_count > 1
+
+
 def _get_default_schedule(config):
   """Returns the default schedule for the provided RunConfig."""
-  if not config or not config.job_name:
-    return None
+  if not config or not _is_distributed(config):
+    return 'local_run'
 
-  if not config.job_name or config.job_name == 'master':
-    # TODO(rhaertel): handle the case there are more
-    # than one masters or explicitly disallow.
+  if not config.job_name:
+    raise ValueError('Must specify a schedule')
+
+  if config.job_name == 'master':
+    # TODO(rhaertel): handle the case where there is more than one master
+    # or explicitly disallow such a case.
     return 'local_run'
   elif config.job_name == 'ps':
     return 'run_std_server'
   elif config.job_name == 'worker':
     return 'train'
 
-  return ValueError('No default schedule for task type: %s' %
-                    (config.job_name,))
+  raise ValueError('No default schedule for task type: %s' % (config.job_name,))
diff --git a/tensorflow/contrib/learn/python/learn/models.py b/tensorflow/contrib/learn/python/learn/models.py
index be55dbb9f31..749eeadb1f7 100644
--- a/tensorflow/contrib/learn/python/learn/models.py
+++ b/tensorflow/contrib/learn/python/learn/models.py
@@ -335,7 +335,7 @@ def get_rnn_model(rnn_size, cell_type, num_layers, input_op_fn, bidirectional,
           fw_cell, attn_length=attn_length, attn_size=attn_size,
           attn_vec_size=attn_vec_size, state_is_tuple=False)
         bw_cell = contrib_rnn.AttentionCellWrapper(
-          fw_cell, attn_length=attn_length, attn_size=attn_size,
+          bw_cell, attn_length=attn_length, attn_size=attn_size,
           attn_vec_size=attn_vec_size, state_is_tuple=False)
       rnn_fw_cell = nn.rnn_cell.MultiRNNCell([fw_cell] * num_layers,
                                              state_is_tuple=False)
diff --git a/tensorflow/contrib/learn/python/learn/tests/learn_runner_test.py b/tensorflow/contrib/learn/python/learn/tests/learn_runner_test.py
index e7a16b002ff..519d6377ea6 100644
--- a/tensorflow/contrib/learn/python/learn/tests/learn_runner_test.py
+++ b/tensorflow/contrib/learn/python/learn/tests/learn_runner_test.py
@@ -39,7 +39,7 @@ class TestExperiment(tf.contrib.learn.Experiment):
     return Estimator()
 
   def local_run(self):
-    return "train_and_evaluate"
+    return "local_run"
 
   def train(self):
     return "train"
@@ -62,6 +62,18 @@ def build_non_experiment(output_dir):
 # pylint: enable=unused-argument
 
 
+def build_distributed_cluster_spec():
+  return tf.train.ClusterSpec(
+      {"ps": ["localhost:1234", "localhost:1235"],
+       "worker": ["localhost:1236", "localhost:1237"],
+       "master": ["localhost:1238"],
+       "foo_has_no_default_schedule": ["localhost:1239"]})
+
+
+def build_non_distributed_cluster_spec():
+  return tf.train.ClusterSpec({"foo": ["localhost:1234"]})
+
+
 class MainTest(tf.test.TestCase):
 
   def setUp(self):
@@ -76,7 +88,9 @@ class MainTest(tf.test.TestCase):
                          schedule="simple_task"))
 
   def test_schedule_from_tf_config(self):
-    os.environ["TF_CONFIG"] = json.dumps({"task": {"type": "worker"}})
+    os.environ["TF_CONFIG"] = json.dumps(
+        {"cluster": build_distributed_cluster_spec().as_dict(),
+         "task": {"type": "worker"}})
     # RunConfig constructuor will set job_name from TF_CONFIG.
     config = run_config.RunConfig()
     self.assertEqual(
@@ -85,28 +99,35 @@ class MainTest(tf.test.TestCase):
                          output_dir="/tmp"))
 
   def test_schedule_from_manually_specified_job_name(self):
-    config = run_config.RunConfig(job_name="worker")
+    config = run_config.RunConfig(
+        job_name="worker", cluster_spec=build_distributed_cluster_spec())
     self.assertEqual(
         "train",
         learn_runner.run(lambda output_dir: TestExperiment(config=config),
                          output_dir="/tmp"))
 
-  def test_schedule_from_config_runs_train_and_evaluate_on_master(self):
-    config = run_config.RunConfig(job_name="master", task=0, is_chief=True)
+  def test_schedule_from_config_runs_local_run_on_master(self):
+    config = run_config.RunConfig(
+        job_name="master",
+        cluster_spec=build_distributed_cluster_spec(),
+        task=0,
+        is_chief=True)
     self.assertEqual(
-        "train_and_evaluate",
+        "local_run",
         learn_runner.run(lambda output_dir: TestExperiment(config=config),
                          output_dir="/tmp"))
 
   def test_schedule_from_config_runs_serve_on_ps(self):
-    config = run_config.RunConfig(job_name="ps")
+    config = run_config.RunConfig(
+        job_name="ps", cluster_spec=build_distributed_cluster_spec())
     self.assertEqual(
         "run_std_server",
         learn_runner.run(lambda output_dir: TestExperiment(config=config),
                          output_dir="/tmp"))
 
   def test_schedule_from_config_runs_train_on_worker(self):
-    config = run_config.RunConfig(job_name="worker")
+    config = run_config.RunConfig(
+        job_name="worker", cluster_spec=build_distributed_cluster_spec())
     self.assertEqual(
         "train",
         learn_runner.run(lambda output_dir: TestExperiment(config=config),
@@ -117,13 +138,27 @@ class MainTest(tf.test.TestCase):
                             learn_runner.run, build_experiment, "",
                             "simple_task")
 
-  def test_fail_no_schedule_and_no_config(self):
-    self.assertRaisesRegexp(ValueError, "Must specify a schedule",
-                            learn_runner.run, build_experiment, "/tmp")
+  def test_no_schedule_and_no_config_runs_local_run(self):
+    self.assertEqual(
+        "local_run",
+        learn_runner.run(build_experiment,
+                         output_dir="/tmp"))
+
+  def test_no_schedule_and_non_distributed_runs_local_run(self):
+    config = run_config.RunConfig(
+        cluster_spec=build_non_distributed_cluster_spec())
+    self.assertEqual(
+        "local_run",
+        learn_runner.run(lambda output_dir: TestExperiment(config=config),
+                         output_dir="/tmp"))
 
   def test_fail_job_name_with_no_default_schedule(self):
-    self.assertRaisesRegexp(ValueError, "Must specify a schedule",
-                            learn_runner.run, build_experiment, "/tmp")
+    config = run_config.RunConfig(
+        job_name="foo_has_no_default_schedule",
+        cluster_spec=build_distributed_cluster_spec())
+    create_experiment_fn = lambda output_dir: TestExperiment(config=config)
+    self.assertRaisesRegexp(ValueError, "No default schedule",
+                            learn_runner.run, create_experiment_fn, "/tmp")
 
   def test_fail_non_callable(self):
     self.assertRaisesRegexp(TypeError, "Experiment builder .* is not callable",
@@ -148,7 +183,8 @@ class MainTest(tf.test.TestCase):
                             "default")
 
   def test_fail_schedule_from_config_with_no_job_name(self):
-    config = run_config.RunConfig(job_name=None)
+    config = run_config.RunConfig(
+        job_name=None, cluster_spec=build_distributed_cluster_spec())
     self.assertRaisesRegexp(
         ValueError,
         "Must specify a schedule",
diff --git a/tensorflow/contrib/linear_optimizer/kernels/sdca_ops_test.cc b/tensorflow/contrib/linear_optimizer/kernels/sdca_ops_test.cc
index 89e56ac8089..dcdc9ad1cf2 100644
--- a/tensorflow/contrib/linear_optimizer/kernels/sdca_ops_test.cc
+++ b/tensorflow/contrib/linear_optimizer/kernels/sdca_ops_test.cc
@@ -77,13 +77,6 @@ Node* Ones(Graph* const g, const int n) {
   return test::graph::Constant(g, data);
 }
 
-Node* StringIota(Graph* const g, const int n) {
-  Tensor data(DT_STRING, TensorShape({n}));
-  test::FillFn<string>(
-      &data, [](const int i) { return strings::StrCat(strings::Hex(i)); });
-  return test::graph::Constant(g, data);
-}
-
 Node* SparseExampleIndices(Graph* const g, const int sparse_features_per_group,
                            const int num_examples) {
   const int x_size = num_examples * 4;
diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index a365a99004e..677326c548b 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -60,9 +60,6 @@ HOST_OBJDIR := $(MAKEFILE_DIR)/gen/host_obj/
 HOST_BINDIR := $(MAKEFILE_DIR)/gen/host_bin/
 HOST_GENDIR := $(MAKEFILE_DIR)/gen/host_obj/
 
-# Find the current Eigen version from the Bazel configuration
-EIGEN_VERSION := $(shell grep eigen_version tensorflow/workspace.bzl | head -1 | sed -e 's/.*eigen_version.*=.*"\(.*\)"/\1/')
-
 # Settings for the host compiler.
 HOST_CXX := $(CC_PREFIX) gcc
 HOST_CXXFLAGS := --std=c++11
@@ -75,7 +72,7 @@ HOST_LDOPTS += -L/usr/local/lib
 HOST_INCLUDES := \
 -I. \
 -I$(MAKEFILE_DIR)/downloads/ \
--I$(MAKEFILE_DIR)/downloads/eigen-eigen-$(EIGEN_VERSION) \
+-I$(MAKEFILE_DIR)/downloads/eigen \
 -I$(HOST_GENDIR)
 ifeq ($(HAS_GEN_HOST_PROTOC),true)
 	HOST_INCLUDES += -I$(MAKEFILE_DIR)/gen/protobuf-host/include
@@ -148,7 +145,7 @@ DEPFLAGS = -MT $@ -MMD -MP -MF $(DEPDIR)/$*.Td
 INCLUDES := \
 -I. \
 -I$(MAKEFILE_DIR)/downloads/ \
--I$(MAKEFILE_DIR)/downloads/eigen-eigen-$(EIGEN_VERSION) \
+-I$(MAKEFILE_DIR)/downloads/eigen \
 -I$(PROTOGENDIR) \
 -I$(PBTGENDIR)
 ifeq ($(HAS_GEN_HOST_PROTOC),true)
@@ -240,7 +237,7 @@ ifeq ($(TARGET),ANDROID)
 -I$(NDK_ROOT)/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi/include \
 -I. \
 -I$(MAKEFILE_DIR)/downloads/ \
--I$(MAKEFILE_DIR)/downloads/eigen-eigen-$(EIGEN_VERSION) \
+-I$(MAKEFILE_DIR)/downloads/eigen \
 -I$(MAKEFILE_DIR)/gen/protobuf/include \
 -I$(PROTOGENDIR) \
 -I$(PBTGENDIR)
@@ -570,6 +567,12 @@ clean:
 	rm -rf $(MAKEFILE_DIR)/gen
 	rm -rf tensorflow/core/util/version_info.cc
 
+# Gets rid of all generated files except protobuf libs generated
+# before calling make.  This allows users not to recompile proto libs every time.
+clean_except_protobuf_libs:
+	find $(MAKEFILE_DIR)/gen -mindepth 1 -maxdepth 1 ! -name "protobuf" ! -name "protobuf-host" -exec rm -r "{}" \;
+	rm -rf tensorflow/core/util/version_info.cc
+
 # Gets rid of target files only, leaving the host alone. Also leaves the lib
 # directory untouched deliberately, so we can persist multiple architectures
 # across builds for iOS.
diff --git a/tensorflow/contrib/makefile/build_all_android.sh b/tensorflow/contrib/makefile/build_all_android.sh
index 7d705021a89..bf36be23f7c 100755
--- a/tensorflow/contrib/makefile/build_all_android.sh
+++ b/tensorflow/contrib/makefile/build_all_android.sh
@@ -46,18 +46,19 @@ shift $((OPTIND - 1))
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 cd ${SCRIPT_DIR}/../../../
 
-# Remove any old files first.
-make -f tensorflow/contrib/makefile/Makefile clean
-
 if [[ "${ONLY_MAKE_TENSORFLOW}" != "true" ]]; then
+  # Remove any old files first.
+  make -f tensorflow/contrib/makefile/Makefile clean
   rm -rf tensorflow/contrib/makefile/downloads
   # Pull down the required versions of the frameworks we need.
   tensorflow/contrib/makefile/download_dependencies.sh
-fi
-
-# Compile protobuf for the target Android device architectures.
+  # Compile protobuf for the target Android device architectures.
   CC_PREFIX="${CC_PREFIX}" NDK_ROOT="${NDK_ROOT}" \
 tensorflow/contrib/makefile/compile_android_protobuf.sh -c
+else
+  # Only clean files generated by make
+  make -f tensorflow/contrib/makefile/Makefile clean_except_protobuf_libs
+fi
 
 if [[ "${USE_HEXAGON}" == "true" ]]; then
     HEXAGON_PARENT_DIR=$(cd ../hexagon && pwd)
diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh
index e1450210659..3fc841edd06 100755
--- a/tensorflow/contrib/makefile/download_dependencies.sh
+++ b/tensorflow/contrib/makefile/download_dependencies.sh
@@ -1,4 +1,4 @@
-#!/bin/bash -ex
+#!/bin/bash
 # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,57 +14,52 @@
 # limitations under the License.
 # ==============================================================================
 
+set -e
+
 DOWNLOADS_DIR=tensorflow/contrib/makefile/downloads
 BZL_FILE_PATH=tensorflow/workspace.bzl
 
-mkdir -p ${DOWNLOADS_DIR}
+EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/.*tar\.gz' "${BZL_FILE_PATH}")"
+GEMMLOWP_URL="$(grep -o 'http.*github.com/google/gemmlowp/.*tar\.gz' "${BZL_FILE_PATH}")"
+GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
+PROTOBUF_URL="$(grep -o 'http.*github.com/google/protobuf/.*tar\.gz' "${BZL_FILE_PATH}")"
+RE2_URL="$(grep -o 'http.*github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}")"
 
-# Grab the current Eigen version name from the Bazel build file
-EIGEN_HASH=$(cat "${BZL_FILE_PATH}" | egrep "eigen_version.*=.*\".*\"" | awk '{ print $3 }')
-# Trim trailing and preceding double quotes
-EIGEN_HASH="${EIGEN_HASH%\"}"
-EIGEN_HASH="${EIGEN_HASH#\"}"
-
-if [[ -z "${EIGEN_HASH}" ]]; then
-    echo >&2 "Eigen hash does not exist."
-    exit 1
-else
-    echo "Eigen hash = ${EIGEN_HASH}"
-fi
-
-curl "https://bitbucket.org/eigen/eigen/get/${EIGEN_HASH}.tar.gz" \
--o /tmp/eigen-${EIGEN_HASH}.tar.gz
-tar xzf /tmp/eigen-${EIGEN_HASH}.tar.gz -C ${DOWNLOADS_DIR}
-
-# Link to the downloaded Eigen library from a permanent directory name, since
-# the downloaded name changes with every version.
-cd ${DOWNLOADS_DIR}
-rm -rf eigen-latest
-ln -s eigen-eigen-${EIGEN_HASH} eigen-latest
-
-# TODO(petewarden) - Some new code in Eigen triggers a clang bug with iOS arm64,
-# so work around it by patching the source.
-function replace_by_sed() {
+# TODO(petewarden): Some new code in Eigen triggers a clang bug with iOS arm64,
+#                   so work around it by patching the source.
+replace_by_sed() {
+  local regex="${1}"
+  shift
   if echo "${OSTYPE}" | grep -q darwin; then
-    sed -e "$1" -i '' "$2"
+    sed -i '' -e "${regex}" "$@"
   else
-    sed -e "$1" -i "$2"
+    sed -i -e "${regex}" "$@"
   fi
 }
+
+download_and_extract() {
+  local usage="Usage: download_and_extract URL DIR"
+  local url="${1:?${usage}}"
+  local dir="${2:?${usage}}"
+  echo "downloading ${url}" >&2
+  mkdir -p "${dir}"
+  tar -C "${dir}" --strip-components=1 -xz < <(curl -Ls "${url}")
+}
+
+download_and_extract "${EIGEN_URL}" "${DOWNLOADS_DIR}/eigen"
+download_and_extract "${GEMMLOWP_URL}" "${DOWNLOADS_DIR}/gemmlowp"
+download_and_extract "${GOOGLETEST_URL}" "${DOWNLOADS_DIR}/googletest"
+download_and_extract "${PROTOBUF_URL}" "${DOWNLOADS_DIR}/protobuf"
+download_and_extract "${RE2_URL}" "${DOWNLOADS_DIR}/re2"
+
 replace_by_sed 's#static uint32x4_t p4ui_CONJ_XOR = vld1q_u32( conj_XOR_DATA );#static uint32x4_t p4ui_CONJ_XOR; // = vld1q_u32( conj_XOR_DATA ); - Removed by script#' \
-eigen-latest/Eigen/src/Core/arch/NEON/Complex.h
+  "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
 replace_by_sed 's#static uint32x2_t p2ui_CONJ_XOR = vld1_u32( conj_XOR_DATA );#static uint32x2_t p2ui_CONJ_XOR;// = vld1_u32( conj_XOR_DATA ); - Removed by scripts#' \
-eigen-latest/Eigen/src/Core/arch/NEON/Complex.h
+  "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
 replace_by_sed 's#static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA );#static uint64x2_t p2ul_CONJ_XOR;// = vld1q_u64( p2ul_conj_XOR_DATA ); - Removed by script#' \
-eigen-latest/Eigen/src/Core/arch/NEON/Complex.h
-
-git clone https://github.com/google/re2.git re2
-git clone https://github.com/google/gemmlowp.git gemmlowp
-git clone https://github.com/google/protobuf.git protobuf
-git clone https://github.com/google/googletest.git googletest
-
+  "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
 # TODO(satok): Remove this once protobuf/autogen.sh is fixed.
 replace_by_sed 's#https://googlemock.googlecode.com/files/gmock-1.7.0.zip#http://download.tensorflow.org/deps/gmock-1.7.0.zip#' \
-protobuf/autogen.sh
+  "${DOWNLOADS_DIR}/protobuf/autogen.sh"
 
-echo "download_dependencies.sh completed successfully."
+echo "download_dependencies.sh completed successfully." >&2
diff --git a/tensorflow/contrib/makefile/proto_text_cc_files.txt b/tensorflow/contrib/makefile/proto_text_cc_files.txt
index 3f8c3c899c9..f84c70250ec 100644
--- a/tensorflow/contrib/makefile/proto_text_cc_files.txt
+++ b/tensorflow/contrib/makefile/proto_text_cc_files.txt
@@ -35,7 +35,6 @@ tensorflow/core/lib/io/record_writer.cc
 tensorflow/core/lib/io/record_reader.cc
 tensorflow/core/lib/io/random_inputstream.cc
 tensorflow/core/lib/io/path.cc
-tensorflow/core/lib/io/match.cc
 tensorflow/core/lib/io/iterator.cc
 tensorflow/core/lib/io/inputstream_interface.cc
 tensorflow/core/lib/io/inputbuffer.cc
diff --git a/tensorflow/contrib/makefile/proto_text_pb_h_files.txt b/tensorflow/contrib/makefile/proto_text_pb_h_files.txt
index b6028734392..8645e4205a9 100644
--- a/tensorflow/contrib/makefile/proto_text_pb_h_files.txt
+++ b/tensorflow/contrib/makefile/proto_text_pb_h_files.txt
@@ -8,6 +8,7 @@ tensorflow/core/protobuf/queue_runner.pb.h
 tensorflow/core/protobuf/named_tensor.pb.h
 tensorflow/core/protobuf/meta_graph.pb.h
 tensorflow/core/protobuf/config.pb.h
+tensorflow/core/protobuf/tensor_bundle.pb.h
 tensorflow/core/lib/core/error_codes.pb.h
 tensorflow/core/framework/versions.pb.h
 tensorflow/core/framework/variable.pb.h
diff --git a/tensorflow/contrib/makefile/tf_pb_text_files.txt b/tensorflow/contrib/makefile/tf_pb_text_files.txt
index 9db31e5d235..815689aab61 100644
--- a/tensorflow/contrib/makefile/tf_pb_text_files.txt
+++ b/tensorflow/contrib/makefile/tf_pb_text_files.txt
@@ -2,6 +2,7 @@ tensorflow/core/util/saved_tensor_slice.pb_text.cc
 tensorflow/core/util/memmapped_file_system.pb_text.cc
 tensorflow/core/protobuf/saver.pb_text.cc
 tensorflow/core/protobuf/config.pb_text.cc
+tensorflow/core/protobuf/tensor_bundle.pb_text.cc
 tensorflow/core/lib/core/error_codes.pb_text.cc
 tensorflow/core/framework/versions.pb_text.cc
 tensorflow/core/framework/types.pb_text.cc
diff --git a/tensorflow/contrib/makefile/tf_proto_files.txt b/tensorflow/contrib/makefile/tf_proto_files.txt
index 1b85c50060d..74675ac9bf3 100644
--- a/tensorflow/contrib/makefile/tf_proto_files.txt
+++ b/tensorflow/contrib/makefile/tf_proto_files.txt
@@ -8,6 +8,7 @@ tensorflow/core/protobuf/queue_runner.proto
 tensorflow/core/protobuf/named_tensor.proto
 tensorflow/core/protobuf/meta_graph.proto
 tensorflow/core/protobuf/config.proto
+tensorflow/core/protobuf/tensor_bundle.proto
 tensorflow/core/lib/core/error_codes.proto
 tensorflow/core/framework/versions.proto
 tensorflow/core/framework/variable.proto
diff --git a/tensorflow/contrib/metrics/__init__.py b/tensorflow/contrib/metrics/__init__.py
index a4d72cd901a..396f85d434f 100644
--- a/tensorflow/contrib/metrics/__init__.py
+++ b/tensorflow/contrib/metrics/__init__.py
@@ -118,6 +118,7 @@ time.
 @@streaming_mean_cosine_distance
 @@streaming_percentage_less
 @@streaming_sensitivity_at_specificity
+@@streaming_sparse_average_precision_at_k
 @@streaming_sparse_precision_at_k
 @@streaming_sparse_recall_at_k
 @@streaming_specificity_at_sensitivity
@@ -167,6 +168,7 @@ from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_recall_at
 from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_recall_at_thresholds
 from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_root_mean_squared_error
 from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_sensitivity_at_specificity
+from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_sparse_average_precision_at_k
 from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_sparse_precision_at_k
 from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_sparse_recall_at_k
 from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_specificity_at_sensitivity
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index be91554b2ef..120ad941fa4 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -31,13 +31,13 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-from tensorflow.python.util.all_util import make_all
 
 
 IGNORE_MASK_DATE = '2016-10-19'
@@ -92,6 +92,27 @@ def _safe_div(numerator, denominator, name):
       name=name)
 
 
+def _safe_scalar_div(numerator, denominator, name):
+  """Divides two values, returning 0 if the denominator is 0.
+
+  Args:
+    numerator: A scalar `float64` `Tensor`.
+    denominator: A scalar `float64` `Tensor`.
+    name: Name for the returned op.
+
+  Returns:
+    0 if `denominator` == 0, else `numerator` / `denominator`
+  """
+  numerator.get_shape().with_rank_at_most(1)
+  denominator.get_shape().with_rank_at_most(1)
+  return control_flow_ops.cond(
+      math_ops.equal(
+          array_ops.constant(0.0, dtype=dtypes.float64), denominator),
+      lambda: array_ops.constant(0.0, dtype=dtypes.float64),
+      lambda: math_ops.div(numerator, denominator),
+      name=name)
+
+
 def _create_local(name, shape=None, collections=None, dtype=dtypes.float32):
   """Creates a new local variable.
 
@@ -122,7 +143,7 @@ def _count_condition(values, weights=None, metrics_collections=None,
 
   Args:
     values: A `bool` `Tensor` of arbitrary size.
-    weights: An optional `Tensor` whose shape matches `values`.
+    weights: An optional `Tensor` whose shape is broadcastable to `values`.
     metrics_collections: An optional list of collections that the metric
       value variable should be added to.
     updates_collections: An optional list of collections that the metric update
@@ -142,7 +163,6 @@ def _count_condition(values, weights=None, metrics_collections=None,
 
   values = math_ops.to_float(values)
   if weights is not None:
-    values.get_shape().assert_is_compatible_with(weights.get_shape())
     weights = math_ops.to_float(weights)
     values = math_ops.mul(values, weights)
 
@@ -158,46 +178,6 @@ def _count_condition(values, weights=None, metrics_collections=None,
   return value_tensor, update_op
 
 
-def _streaming_true_negatives(predictions, labels, weights=None,
-                              metrics_collections=None,
-                              updates_collections=None,
-                              name=None):
-  """Sum the weights of true_negatives.
-
-  If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
-
-  Args:
-    predictions: The predicted values, a `bool` `Tensor` of arbitrary
-      dimensions.
-    labels: The ground truth values, a `bool` `Tensor` whose dimensions must
-      match `predictions`.
-    weights: An optional `Tensor` whose shape matches `predictions`.
-    metrics_collections: An optional list of collections that the metric
-      value variable should be added to.
-    updates_collections: An optional list of collections that the metric update
-      ops should be added to.
-    name: An optional variable_scope name.
-
-  Returns:
-    value_tensor: A tensor representing the current value of the metric.
-    update_op: An operation that accumulates the error from a batch of data.
-
-  Raises:
-    ValueError: If `predictions` and `labels` have mismatched shapes, or if
-      `weights` is not `None` and its shape doesn't match `predictions`, or if
-      either `metrics_collections` or `updates_collections` are not a list or
-      tuple.
-  """
-  with variable_scope.variable_scope(
-      [predictions, labels], name, 'true_negatives'):
-
-    predictions.get_shape().assert_is_compatible_with(labels.get_shape())
-    is_true_negative = math_ops.logical_and(math_ops.equal(labels, 0),
-                                            math_ops.equal(predictions, 0))
-    return _count_condition(is_true_negative, weights, metrics_collections,
-                            updates_collections)
-
-
 def _streaming_true_positives(predictions, labels, weights=None,
                               metrics_collections=None,
                               updates_collections=None,
@@ -211,7 +191,7 @@ def _streaming_true_positives(predictions, labels, weights=None,
       dimensions.
     labels: The ground truth values, a `bool` `Tensor` whose dimensions must
       match `predictions`.
-    weights: An optional `Tensor` whose shape matches `predictions`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
     metrics_collections: An optional list of collections that the metric
       value variable should be added to.
     updates_collections: An optional list of collections that the metric update
@@ -251,7 +231,7 @@ def _streaming_false_positives(predictions, labels, weights=None,
       dimensions.
     labels: The ground truth values, a `bool` `Tensor` whose dimensions must
       match `predictions`.
-    weights: An optional `Tensor` whose shape matches `predictions`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
     metrics_collections: An optional list of collections that the metric
       value variable should be added to.
     updates_collections: An optional list of collections that the metric update
@@ -291,7 +271,7 @@ def _streaming_false_negatives(predictions, labels, weights=None,
       dimensions.
     labels: The ground truth values, a `bool` `Tensor` whose dimensions must
       match `predictions`.
-    weights: An optional `Tensor` whose shape matches `predictions`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
     metrics_collections: An optional list of collections that the metric
       value variable should be added to.
     updates_collections: An optional list of collections that the metric update
@@ -317,6 +297,31 @@ def _streaming_false_negatives(predictions, labels, weights=None,
                             updates_collections)
 
 
+def _broadcast_weights(weights, values):
+  """Broadcast `weights` to the same shape as `values`.
+
+  This returns a version of `weights` following the same broadcast rules as
+  `mul(weights, values)`. When computing a weighted average, use this function
+  to broadcast `weights` before summing them; e.g.,
+  `reduce_sum(w * v) / reduce_sum(_broadcast_weights(w, v))`.
+
+  Args:
+    weights: `Tensor` whose shape is broadcastable to `values`.
+    values: `Tensor` of any shape.
+
+  Returns:
+    `weights` broadcast to `values` shape.
+  """
+  weights_shape = weights.get_shape()
+  values_shape = values.get_shape()
+  if (weights_shape.is_fully_defined() and
+      values_shape.is_fully_defined() and
+      weights_shape.is_compatible_with(values_shape)):
+    return weights
+  return math_ops.mul(
+      weights, array_ops.ones_like(values), name='broadcast_weights')
+
+
 def streaming_mean(values, weights=None, metrics_collections=None,
                    updates_collections=None, name=None):
   """Computes the (weighted) mean of the given values.
@@ -335,7 +340,7 @@ def streaming_mean(values, weights=None, metrics_collections=None,
 
   Args:
     values: A `Tensor` of arbitrary dimensions.
-    weights: An optional `Tensor` whose shape matches `values`.
+    weights: An optional `Tensor` whose shape is broadcastable to `values`.
     metrics_collections: An optional list of collections that `mean`
       should be added to.
     updates_collections: An optional list of collections that `update_op`
@@ -360,10 +365,9 @@ def streaming_mean(values, weights=None, metrics_collections=None,
     count = _create_local('count', shape=[])
 
     if weights is not None:
-      values.get_shape().assert_is_compatible_with(weights.get_shape())
       weights = math_ops.to_float(weights)
       values = math_ops.mul(values, weights)
-      num_values = math_ops.reduce_sum(weights)
+      num_values = math_ops.reduce_sum(_broadcast_weights(weights, values))
     else:
       num_values = math_ops.to_float(array_ops.size(values))
 
@@ -405,7 +409,7 @@ def streaming_mean_tensor(values, weights=None, metrics_collections=None,
 
   Args:
     values: A `Tensor` of arbitrary dimensions.
-    weights: An optional `Tensor` whose shape matches `values`.
+    weights: An optional `Tensor` whose shape is broadcastable to `values`.
     metrics_collections: An optional list of collections that `mean`
       should be added to.
     updates_collections: An optional list of collections that `update_op`
@@ -427,13 +431,11 @@ def streaming_mean_tensor(values, weights=None, metrics_collections=None,
     total = _create_local('total_tensor', shape=values.get_shape())
     count = _create_local('count_tensor', shape=values.get_shape())
 
+    num_values = array_ops.ones_like(values)
     if weights is not None:
-      values.get_shape().assert_is_compatible_with(weights.get_shape())
       weights = math_ops.to_float(weights)
       values = math_ops.mul(values, weights)
-      num_values = weights
-    else:
-      num_values = array_ops.ones_like(values)
+      num_values = math_ops.mul(num_values, weights)
 
     total_compute_op = state_ops.assign_add(total, values)
     count_compute_op = state_ops.assign_add(count, num_values)
@@ -481,7 +483,7 @@ def streaming_accuracy(predictions, labels, weights=None,
     predictions: The predicted values, a `Tensor` of any shape.
     labels: The ground truth values, a `Tensor` whose shape matches
       `predictions`.
-    weights: An optional `Tensor` whose shape matches `predictions`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
     metrics_collections: An optional list of collections that `accuracy` should
       be added to.
     updates_collections: An optional list of collections that `update_op` should
@@ -536,7 +538,7 @@ def streaming_precision(predictions, labels, ignore_mask=None, weights=None,
     labels: The ground truth values, a `bool` `Tensor` whose dimensions must
       match `predictions`.
     ignore_mask: An optional, `bool` `Tensor` whose shape matches `predictions`.
-    weights: An optional `Tensor` whose shape matches `predictions`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
     metrics_collections: An optional list of collections that `precision` should
       be added to.
     updates_collections: An optional list of collections that `update_op` should
@@ -617,7 +619,7 @@ def streaming_recall(predictions, labels, ignore_mask=None, weights=None,
     labels: The ground truth values, a `bool` `Tensor` whose dimensions must
       match `predictions`.
     ignore_mask: An optional, `bool` `Tensor` whose shape matches `predictions`.
-    weights: An optional `Tensor` whose shape matches `predictions`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
     metrics_collections: An optional list of collections that `recall` should
       be added to.
     updates_collections: An optional list of collections that `update_op` should
@@ -695,9 +697,10 @@ def _tp_fn_tn_fp(predictions, labels, thresholds, weights=None):
   Args:
     predictions: A floating point `Tensor` of arbitrary shape and whose values
       are in the range `[0, 1]`.
-    labels: A `bool` `Tensor` whose shape matches `predictions`.
+    labels: A `Tensor` whose shape matches `predictions`. `labels` will be cast
+      to `bool`.
     thresholds: A python list or tuple of float thresholds in `[0, 1]`.
-    weights: An optional `Tensor` whose shape matches `predictions`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
 
   Returns:
     true_positive: A variable of shape [len(thresholds)].
@@ -720,28 +723,29 @@ def _tp_fn_tn_fp(predictions, labels, thresholds, weights=None):
 
   num_thresholds = len(thresholds)
 
-  # Reshape predictions and labels
-  predictions = array_ops.reshape(predictions, [-1, 1])
-  labels = array_ops.reshape(math_ops.cast(labels, dtype=dtypes.bool), [1, -1])
+  # Reshape predictions and labels.
+  predictions_2d = array_ops.reshape(predictions, [-1, 1])
+  labels_2d = array_ops.reshape(
+      math_ops.cast(labels, dtype=dtypes.bool), [1, -1])
 
   # Use static shape if known.
-  num_predictions = predictions.get_shape().as_list()[0]
+  num_predictions = predictions_2d.get_shape().as_list()[0]
 
   # Otherwise use dynamic shape.
   if num_predictions is None:
-    num_predictions = array_ops.shape(predictions)[0]
+    num_predictions = array_ops.shape(predictions_2d)[0]
   thresh_tiled = array_ops.tile(
       array_ops.expand_dims(array_ops.constant(thresholds), [1]),
       array_ops.pack([1, num_predictions]))
 
   # Tile the predictions after thresholding them across different thresholds.
   pred_is_pos = math_ops.greater(
-      array_ops.tile(array_ops.transpose(predictions), [num_thresholds, 1]),
+      array_ops.tile(array_ops.transpose(predictions_2d), [num_thresholds, 1]),
       thresh_tiled)
   pred_is_neg = math_ops.logical_not(pred_is_pos)
 
   # Tile labels by number of thresholds
-  label_is_pos = array_ops.tile(labels, [num_thresholds, 1])
+  label_is_pos = array_ops.tile(labels_2d, [num_thresholds, 1])
   label_is_neg = math_ops.logical_not(label_is_pos)
 
   true_positives = _create_local('true_positives', shape=[num_thresholds])
@@ -760,8 +764,8 @@ def _tp_fn_tn_fp(predictions, labels, thresholds, weights=None):
 
   if weights is not None:
     weights = math_ops.to_float(weights)
-    weights_tiled = array_ops.tile(
-        array_ops.reshape(weights, [1, -1]), [num_thresholds, 1])
+    weights_tiled = array_ops.tile(array_ops.reshape(
+        _broadcast_weights(weights, predictions), [1, -1]), [num_thresholds, 1])
     thresh_tiled.get_shape().assert_is_compatible_with(
         weights_tiled.get_shape())
     is_true_positive *= weights_tiled
@@ -811,7 +815,7 @@ def streaming_auc(predictions, labels, weights=None, num_thresholds=200,
     predictions: A floating point `Tensor` of arbitrary shape and whose values
       are in the range `[0, 1]`.
     labels: A `bool` `Tensor` whose shape matches `predictions`.
-    weights: An optional `Tensor` whose shape matches `predictions`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
     num_thresholds: The number of thresholds to use when discretizing the roc
       curve.
     metrics_collections: An optional list of collections that `auc` should be
@@ -906,7 +910,7 @@ def streaming_specificity_at_sensitivity(
       are in the range `[0, 1]`.
     labels: A `bool` `Tensor` whose shape matches `predictions`.
     sensitivity: A scalar value in range `[0, 1]`.
-    weights: An optional `Tensor` whose shape matches `predictions`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
     num_thresholds: The number of thresholds to use for matching the given
       sensitivity.
     metrics_collections: An optional list of collections that `specificity`
@@ -1010,7 +1014,7 @@ def streaming_sensitivity_at_specificity(
       are in the range `[0, 1]`.
     labels: A `bool` `Tensor` whose shape matches `predictions`.
     specificity: A scalar value in range `[0, 1]`.
-    weights: An optional `Tensor` whose shape matches `predictions`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
     num_thresholds: The number of thresholds to use for matching the given
       specificity.
     metrics_collections: An optional list of collections that `sensitivity`
@@ -1095,7 +1099,7 @@ def streaming_precision_at_thresholds(predictions, labels, thresholds,
       are in the range `[0, 1]`.
     labels: A `bool` `Tensor` whose shape matches `predictions`.
     thresholds: A python list or tuple of float thresholds in `[0, 1]`.
-    weights: An optional `Tensor` whose shape matches `predictions`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
     metrics_collections: An optional list of collections that `auc` should be
       added to.
     updates_collections: An optional list of collections that `update_op` should
@@ -1167,7 +1171,7 @@ def streaming_recall_at_thresholds(predictions, labels, thresholds,
       are in the range `[0, 1]`.
     labels: A `bool` `Tensor` whose shape matches `predictions`.
     thresholds: A python list or tuple of float thresholds in `[0, 1]`.
-    weights: An optional `Tensor` whose shape matches `predictions`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
     metrics_collections: An optional list of collections that `recall` should be
       added to.
     updates_collections: An optional list of collections that `update_op` should
@@ -1214,6 +1218,13 @@ def streaming_recall_at_thresholds(predictions, labels, thresholds,
     return recall, update_op
 
 
+def _at_k_name(name, k, class_id=None):
+  name = '%s_at_%d' % (name, k)
+  if class_id is not None:
+    name = '%s_class%d' % (name, class_id)
+  return name
+
+
 @deprecated_args(IGNORE_MASK_DATE, IGNORE_MASK_INSTRUCTIONS, 'ignore_mask')
 def streaming_recall_at_k(predictions, labels, k, ignore_mask=None,
                           weights=None, metrics_collections=None,
@@ -1243,7 +1254,7 @@ def streaming_recall_at_k(predictions, labels, k, ignore_mask=None,
       `int64`.
     k: The number of top elements to look at for computing recall.
     ignore_mask: An optional, `bool` `Tensor` whose shape matches `predictions`.
-    weights: An optional `Tensor` whose shape matches `predictions`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
     metrics_collections: An optional list of collections that `recall_at_k`
       should be added to.
     updates_collections: An optional list of collections `update_op` should be
@@ -1268,7 +1279,7 @@ def streaming_recall_at_k(predictions, labels, k, ignore_mask=None,
                         _mask_weights(ignore_mask, weights),
                         metrics_collections,
                         updates_collections,
-                        name or ('recall_at_%d' % k))
+                        name or _at_k_name('recall', k))
 
 
 # TODO(ptucker): Validate range of values in labels?
@@ -1347,11 +1358,8 @@ def streaming_sparse_recall_at_k(predictions,
       `predictions`, or if either `metrics_collections` or `updates_collections`
       are not a list or tuple.
   """
-  default_name = 'recall_at_%d' % k
-  if class_id is not None:
-    default_name = '%s_class%d' % (default_name, class_id)
-
-  with ops.name_scope(name, default_name, [predictions, labels]) as scope:
+  default_name = _at_k_name('recall', k, class_id=class_id)
+  with ops.name_scope(name, default_name, (predictions, labels)) as scope:
     _, top_k_idx = nn.top_k(predictions, k)
     top_k_idx = math_ops.to_int64(top_k_idx)
     weights = _mask_weights(ignore_mask, weights)
@@ -1450,10 +1458,8 @@ def streaming_sparse_precision_at_k(predictions,
       `predictions`, or if either `metrics_collections` or `updates_collections`
       are not a list or tuple.
   """
-  default_name = 'precision_at_%d' % k
-  if class_id is not None:
-    default_name = '%s_class%d' % (default_name, class_id)
-  with ops.name_scope(name, default_name, [predictions, labels]) as scope:
+  default_name = _at_k_name('precision', k, class_id=class_id)
+  with ops.name_scope(name, default_name, (predictions, labels)) as scope:
     _, top_k_idx = nn.top_k(predictions, k)
     top_k_idx = math_ops.to_int64(top_k_idx)
     weights = _mask_weights(ignore_mask, weights)
@@ -1474,6 +1480,288 @@ def streaming_sparse_precision_at_k(predictions,
     return metric, update
 
 
+def num_relevant(labels, k):
+  """Computes number of relevant values for each row in labels.
+
+  For labels with shape [D1, ... DN, num_labels], this is the minimum of
+  `num_labels` and `k`.
+
+  Args:
+    labels: `int64` `Tensor` or `SparseTensor` with shape
+      [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of
+      target classes for the associated prediction. Commonly, N=1 and `labels`
+      has shape [batch_size, num_labels].
+    k: Integer, k for @k metric.
+
+  Returns:
+    Integer `Tensor` of shape [D1, ... DN], where each value is the number of
+    relevant values for that row.
+
+  Raises:
+    ValueError: if inputs have invalid dtypes or values.
+  """
+  if k < 1:
+    raise ValueError('Invalid k=%s.' % k)
+  with ops.name_scope(None, 'num_relevant', (labels,)) as scope:
+    # For SparseTensor, calculate separate count for each row.
+    if isinstance(labels, (ops.SparseTensor, ops.SparseTensorValue)):
+      labels_sizes = set_ops.set_size(labels)
+      return math_ops.minimum(labels_sizes, k, name=scope)
+
+    # For dense Tensor, calculate scalar count based on last dimension, and
+    # tile across labels shape.
+    labels_shape = array_ops.shape(labels)
+    labels_size = labels_shape[-1]
+    num_relevant_scalar = math_ops.minimum(labels_size, k)
+    return array_ops.fill(labels_shape[0:-1], num_relevant_scalar, name=scope)
+
+
+def expand_and_tile(tensor, multiple, dim=0, name=None):
+  """Slice `tensor` shape in 2, then tile along the sliced dimension.
+
+  A new dimension is inserted in shape of `tensor` before `dim`, then values are
+  tiled `multiple` times along the new dimension.
+
+  Args:
+    tensor: Input `Tensor`.
+    multiple: Integer, number of times to tile.
+    dim: Integer, dimension along which to tile.
+    name: Name of operation.
+
+  Returns:
+    `Tensor` result of expanding and tiling `tensor`.
+
+  Raises:
+    ValueError: if `multiple` is less than 1, or `dim` is not in
+    `[-rank(tensor), rank(tensor)]`.
+  """
+  if multiple < 1:
+    raise ValueError('Invalid multiple %s, must be > 0.' % multiple)
+  with ops.name_scope(
+      name, 'expand_and_tile', (tensor, multiple, dim)) as scope:
+    # Sparse.
+    if isinstance(tensor, ops.SparseTensorValue):
+      tensor = ops.SparseTensor.from_value(tensor)
+    if isinstance(tensor, ops.SparseTensor):
+      if dim < 0:
+        expand_dims = array_ops.reshape(
+            array_ops.size(tensor.shape) + dim, [1])
+      else:
+        expand_dims = [dim]
+      expanded_shape = array_ops.concat(
+          0, (array_ops.slice(tensor.shape, [0], expand_dims), [1],
+              array_ops.slice(tensor.shape, expand_dims, [-1])),
+          name='expanded_shape')
+      expanded = sparse_ops.sparse_reshape(
+          tensor, shape=expanded_shape, name='expand')
+      if multiple == 1:
+        return expanded
+      return sparse_ops.sparse_concat(
+          dim - 1 if dim < 0 else dim, [expanded] * multiple, name=scope)
+
+    # Dense.
+    expanded = array_ops.expand_dims(
+        tensor, dim if (dim >= 0) else (dim - 1), name='expand')
+    if multiple == 1:
+      return expanded
+    ones = array_ops.ones_like(array_ops.shape(tensor))
+    tile_multiples = array_ops.concat(
+        0, (ones[:dim], (multiple,), ones[dim:]), name='multiples')
+    return array_ops.tile(expanded, tile_multiples, name=scope)
+
+
+def sparse_average_precision_at_k(predictions, labels, k):
+  """Computes average precision@k of predictions with respect to sparse labels.
+
+  From en.wikipedia.org/wiki/Information_retrieval#Average_precision, formula
+  for each row is:
+
+    AveP = sum_{i=1...k} P_{i} * rel_{i} / num_relevant_items
+
+  A "row" is the elements in dimension [D1, ... DN] of `predictions`, `labels`,
+  and the result `Tensors`. In the common case, this is [batch_size]. Each row
+  of the results contains the average precision for that row.
+
+  Internally, a `top_k` operation computes a `Tensor` indicating the top `k`
+  `predictions`. Set operations applied to `top_k` and `labels` calculate the
+  true positives, which are used to calculate the precision ("P_{i}" term,
+  above).
+
+  Args:
+    predictions: Float `Tensor` with shape [D1, ... DN, num_classes] where
+      N >= 1. Commonly, N=1 and `predictions` has shape
+      [batch size, num_classes]. The final dimension contains the logit values
+      for each class. [D1, ... DN] must match `labels`.
+    labels: `int64` `Tensor` or `SparseTensor` with shape
+      [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of
+      target classes for the associated prediction. Commonly, N=1 and `labels`
+      has shape [batch_size, num_labels]. [D1, ... DN] must match
+      `predictions_idx`. Values should be in range [0, num_classes], where
+      num_classes is the last dimension of `predictions`.
+    k: Integer, k for @k metric. This will calculate an average precision for
+      range `[1,k]`, as documented above.
+
+  Returns:
+    `float64` `Tensor` of shape [D1, ... DN], where each value is the average
+    precision for that row.
+
+  Raises:
+    ValueError: if k is invalid.
+  """
+  if k < 1:
+    raise ValueError('Invalid k=%s.' % k)
+  with ops.name_scope(
+      None, 'average_precision', (predictions, labels, k)) as scope:
+    # Calculate top k indices to produce [D1, ... DN, k] tensor.
+    _, predictions_idx = nn.top_k(predictions, k)
+    predictions_idx = math_ops.to_int64(predictions_idx, name='predictions_idx')
+
+    # Expand dims to produce [D1, ... DN, k, 1] tensor. This gives us a separate
+    # prediction for each k, so we can calculate separate true positive values
+    # for each k.
+    predictions_idx_per_k = array_ops.expand_dims(
+        predictions_idx, -1, name='predictions_idx_per_k')
+
+    # Replicate labels k times to produce [D1, ... DN, k, num_labels] tensor.
+    labels_per_k = expand_and_tile(
+        labels, multiple=k, dim=-1, name='labels_per_k')
+
+    # The following tensors are all of shape [D1, ... DN, k], containing values
+    # per row, per k value.
+    # `relevant_per_k` (int32) - Relevance indicator, 1 if the prediction at
+    #     that k value is correct, 0 otherwise. This is the "rel_{i}" term from
+    #     the formula above.
+    # `tp_per_k` (int32) - True positive counts.
+    # `retrieved_per_k` (int32) - Number of predicted values at each k. This is
+    #     the precision denominator.
+    # `precision_per_k` (float64) - Precision at each k. This is the "P_{i}"
+    #     term from the formula above.
+    # `relevant_precision_per_k` (float64) - Relevant precisions; i.e.,
+    #     precisions at all k for which relevance indicator is true.
+    relevant_per_k = _sparse_true_positive_at_k(
+        predictions_idx_per_k, labels_per_k, name='relevant_per_k')
+    tp_per_k = math_ops.cumsum(relevant_per_k, axis=-1, name='tp_per_k')
+    retrieved_per_k = math_ops.cumsum(
+        array_ops.ones_like(relevant_per_k), axis=-1, name='retrieved_per_k')
+    precision_per_k = math_ops.div(
+        math_ops.to_double(tp_per_k), math_ops.to_double(retrieved_per_k),
+        name='precision_per_k')
+    relevant_precision_per_k = math_ops.mul(
+        precision_per_k, math_ops.to_double(relevant_per_k),
+        name='relevant_precision_per_k')
+
+    # Reduce along k dimension to get the sum, yielding a [D1, ... DN] tensor.
+    precision_sum = math_ops.reduce_sum(
+        relevant_precision_per_k, reduction_indices=(-1,), name='precision_sum')
+
+    # Divide by number of relevant items to get average precision. These are
+    # the "num_relevant_items" and "AveP" terms from the formula above.
+    num_relevant_items = math_ops.to_double(num_relevant(labels, k))
+    return math_ops.div(precision_sum, num_relevant_items, name=scope)
+
+
+def streaming_sparse_average_precision_at_k(predictions,
+                                            labels,
+                                            k,
+                                            weights=None,
+                                            metrics_collections=None,
+                                            updates_collections=None,
+                                            name=None):
+  """Computes average precision@k of predictions with respect to sparse labels.
+
+  See `sparse_average_precision_at_k` for details on formula. `weights` are
+  applied to the result of `sparse_average_precision_at_k`.
+
+  `streaming_sparse_average_precision_at_k` creates two local variables,
+  `average_precision_at_<k>/total` and `average_precision_at_<k>/max`, that
+  are used to compute the mean average precision. This value is ultimately
+  returned as `average_precision_at_<k>`: an idempotent operation that simply
+  divides `average_precision_at_<k>/total` by `average_precision_at_<k>/max`.
+
+  For estimation of the metric over a stream of data, the function creates an
+  `update_op` operation that updates these variables and returns the
+  `precision_at_<k>`. Internally, a `top_k` operation computes a `Tensor`
+  indicating the top `k` `predictions`. Set operations applied to `top_k` and
+  `labels` calculate the true positives and false positives weighted by
+  `weights`. Then `update_op` increments `true_positive_at_<k>` and
+  `false_positive_at_<k>` using these values.
+
+  If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
+
+  Args:
+    predictions: Float `Tensor` with shape [D1, ... DN, num_classes] where
+      N >= 1. Commonly, N=1 and `predictions` has shape
+      [batch size, num_classes]. The final dimension contains the logit values
+      for each class. [D1, ... DN] must match `labels`.
+    labels: `int64` `Tensor` or `SparseTensor` with shape
+      [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of
+      target classes for the associated prediction. Commonly, N=1 and `labels`
+      has shape [batch_size, num_labels]. [D1, ... DN] must match
+      `predictions_idx`. Values should be in range [0, num_classes], where
+      num_classes is the last dimension of `predictions`.
+    k: Integer, k for @k metric. This will calculate an average precision for
+      range `[1,k]`, as documented above.
+    weights: An optional `Tensor` whose shape is broadcastable to the first
+      [D1, ... DN] dimensions of `predictions` and `labels`.
+    metrics_collections: An optional list of collections that values should
+      be added to.
+    updates_collections: An optional list of collections that updates should
+      be added to.
+    name: Name of new update operation, and namespace for other dependent ops.
+
+  Returns:
+    mean_average_precision: Scalar `float64` `Tensor` with the mean average
+      precision values.
+    update: `Operation` that increments variables appropriately, and whose
+      value matches `metric`.
+  """
+  default_name = _at_k_name('average_precision', k)
+  with ops.name_scope(name, default_name, (predictions, labels)) as scope:
+    # Calculate per-example average precision, and apply weights.
+    average_precision = sparse_average_precision_at_k(
+        predictions=predictions, labels=labels, k=k)
+    if weights is not None:
+      weights = math_ops.to_double(weights)
+      average_precision = math_ops.mul(average_precision, weights)
+
+    # Create accumulation variables and update ops for max average precision and
+    # total average precision.
+    with ops.name_scope(None, 'max', (average_precision,)) as max_scope:
+      # `max` is the max possible precision. Since max for any row is 1.0:
+      # - For the unweighted case, this is just the number of rows.
+      # - For the weighted case, it's the sum of the weights broadcast across
+      #   `average_precision` rows.
+      max_var = contrib_variables.local_variable(
+          array_ops.zeros([], dtype=dtypes.float64), name=max_scope)
+      if weights is None:
+        batch_max = math_ops.to_double(
+            array_ops.size(average_precision, name='batch_max'))
+      else:
+        # TODO(ptucker): More efficient way to broadcast?
+        broadcast_weights = math_ops.mul(
+            weights, array_ops.ones_like(average_precision),
+            name='broadcast_weights')
+        batch_max = math_ops.reduce_sum(broadcast_weights, name='batch_max')
+      max_update = state_ops.assign_add(max_var, batch_max, name='update')
+    with ops.name_scope(None, 'total', (average_precision,)) as total_scope:
+      total_var = contrib_variables.local_variable(
+          array_ops.zeros([], dtype=dtypes.float64), name=total_scope)
+      batch_total = math_ops.reduce_sum(average_precision, name='batch_total')
+      total_update = state_ops.assign_add(total_var, batch_total, name='update')
+
+    # Divide total by max to get mean, for both vars and the update ops.
+    mean_average_precision = _safe_scalar_div(total_var, max_var, name='mean')
+    update = _safe_scalar_div(total_update, max_update, name=scope)
+
+    if metrics_collections:
+      ops.add_to_collections(metrics_collections, mean_average_precision)
+    if updates_collections:
+      ops.add_to_collections(updates_collections, update)
+
+    return mean_average_precision, update
+
+
 def _select_class_id(ids, selected_id):
   """Filter all but `selected_id` out of `ids`.
 
@@ -1486,7 +1774,7 @@ def _select_class_id(ids, selected_id):
     which might be smaller. This contains only the entries equal to
     `selected_id`.
   """
-  if isinstance(ids, ops.SparseTensor):
+  if isinstance(ids, (ops.SparseTensor, ops.SparseTensorValue)):
     return sparse_ops.sparse_retain(
         ids, math_ops.equal(ids.values, selected_id))
 
@@ -1515,7 +1803,8 @@ def _maybe_select_class_id(labels, predictions_idx, selected_id=None):
       has shape [batch_size, num_labels]. [D1, ... DN] must match
       `predictions_idx`.
     predictions_idx: `int64` `Tensor` of class IDs, with shape [D1, ... DN, k]
-      where N >= 1. Commonly, N=1 and predictions has shape [batch size, k].
+      where N >= 1. Commonly, N=1 and `predictions_idx` has shape
+      [batch size, k].
     selected_id: Int id to select.
 
   Returns:
@@ -1527,6 +1816,46 @@ def _maybe_select_class_id(labels, predictions_idx, selected_id=None):
           _select_class_id(predictions_idx, selected_id))
 
 
+def _sparse_true_positive_at_k(predictions_idx,
+                               labels,
+                               class_id=None,
+                               weights=None,
+                               name=None):
+  """Calculates true positives for recall@k and precision@k.
+
+  If `class_id` is specified, calculate binary true positives for `class_id`
+      only.
+  If `class_id` is not specified, calculate metrics for `k` predicted vs
+      `n` label classes, where `n` is the 2nd dimension of `labels_sparse`.
+
+  Args:
+    predictions_idx: 1-D or higher `int64` `Tensor` with last dimension `k`,
+      top `k` predicted classes. For rank `n`, the first `n-1` dimensions must
+      match `labels`.
+    labels: `int64` `Tensor` or `SparseTensor` with shape
+      [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of
+      target classes for the associated prediction. Commonly, N=1 and `labels`
+      has shape [batch_size, num_labels]. [D1, ... DN] must match
+      `predictions_idx`.
+    class_id: Class for which we want binary metrics.
+    weights: `Tensor` whose shape is broadcastable to the first [D1, ... DN]
+      dimensions of `predictions_idx` and `labels`.
+    name: Name of operation.
+
+  Returns:
+    A [D1, ... DN] `Tensor` of true positive counts.
+  """
+  with ops.name_scope(name, 'true_positives', (predictions_idx, labels)):
+    labels, predictions_idx = _maybe_select_class_id(
+        labels, predictions_idx, class_id)
+    tp = set_ops.set_size(set_ops.set_intersection(predictions_idx, labels))
+    tp = math_ops.to_double(tp)
+    if weights is not None:
+      weights = math_ops.to_double(weights)
+      tp = math_ops.mul(tp, weights)
+    return tp
+
+
 def _streaming_sparse_true_positive_at_k(predictions_idx,
                                          labels,
                                          k,
@@ -1563,26 +1892,58 @@ def _streaming_sparse_true_positive_at_k(predictions_idx,
   Raises:
     ValueError: If `weights` is not `None` and has an incomptable shape.
   """
-  default_name = 'true_positive_at_%d' % k
-  if class_id is not None:
-    default_name = '%s_class%d' % (default_name, class_id)
-  with ops.name_scope(name, default_name, [predictions_idx, labels]) as scope:
-    labels, predictions_idx = _maybe_select_class_id(labels,
-                                                     predictions_idx,
-                                                     class_id)
-    tp = set_ops.set_size(set_ops.set_intersection(predictions_idx, labels))
-    tp = math_ops.to_double(tp)
-    if weights is not None:
-      tp.get_shape().assert_is_compatible_with(weights.get_shape())
-      weights = math_ops.to_double(weights)
-      tp = math_ops.mul(tp, weights)
-    batch_total_tp = math_ops.reduce_sum(tp)
+  default_name = _at_k_name('true_positive', k, class_id=class_id)
+  with ops.name_scope(name, default_name, (predictions_idx, labels)) as scope:
+    tp = _sparse_true_positive_at_k(
+        predictions_idx=predictions_idx, labels=labels, class_id=class_id,
+        weights=weights)
+    batch_total_tp = math_ops.to_double(math_ops.reduce_sum(tp))
 
     var = contrib_variables.local_variable(
         array_ops.zeros([], dtype=dtypes.float64), name=scope)
     return var, state_ops.assign_add(var, batch_total_tp, name='update')
 
 
+def _sparse_false_positive_at_k(predictions_idx,
+                                labels,
+                                class_id=None,
+                                weights=None):
+  """Calculates false positives for precision@k.
+
+  If `class_id` is specified, calculate binary false positives for `class_id`
+      only.
+  If `class_id` is not specified, calculate metrics for `k` predicted vs
+      `n` label classes, where `n` is the 2nd dimension of `labels_sparse`.
+
+  Args:
+    predictions_idx: 1-D or higher `int64` `Tensor` with last dimension `k`,
+      top `k` predicted classes. For rank `n`, the first `n-1` dimensions must
+      match `labels`.
+    labels: `int64` `Tensor` or `SparseTensor` with shape
+      [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of
+      target classes for the associated prediction. Commonly, N=1 and `labels`
+      has shape [batch_size, num_labels]. [D1, ... DN] must match
+      `predictions_idx`.
+    class_id: Class for which we want binary metrics.
+    weights: `Tensor` whose shape is broadcastable to the first [D1, ... DN]
+      dimensions of `predictions_idx` and `labels`.
+
+  Returns:
+    A [D1, ... DN] `Tensor` of false positive counts.
+  """
+  with ops.name_scope(None, 'false_positives', (predictions_idx, labels)):
+    labels, predictions_idx = _maybe_select_class_id(labels,
+                                                     predictions_idx,
+                                                     class_id)
+    fp = set_ops.set_size(set_ops.set_difference(
+        predictions_idx, labels, aminusb=True))
+    fp = math_ops.to_double(fp)
+    if weights is not None:
+      weights = math_ops.to_double(weights)
+      fp = math_ops.mul(fp, weights)
+    return fp
+
+
 def _streaming_sparse_false_positive_at_k(predictions_idx,
                                           labels,
                                           k,
@@ -1619,28 +1980,59 @@ def _streaming_sparse_false_positive_at_k(predictions_idx,
   Raises:
     ValueError: If `weights` is not `None` and has an incomptable shape.
   """
-  default_name = 'false_positive_at_%d' % k
-  if class_id is not None:
-    default_name = '%s_class%d' % (default_name, class_id)
-  with ops.name_scope(name, default_name, [predictions_idx, labels]) as scope:
-    labels, predictions_idx = _maybe_select_class_id(labels,
-                                                     predictions_idx,
-                                                     class_id)
-    fp = set_ops.set_size(set_ops.set_difference(predictions_idx,
-                                                 labels,
-                                                 aminusb=True))
-    fp = math_ops.to_double(fp)
-    if weights is not None:
-      fp.get_shape().assert_is_compatible_with(weights.get_shape())
-      weights = math_ops.to_double(weights)
-      fp = math_ops.mul(fp, weights)
-    batch_total_fp = math_ops.reduce_sum(fp)
+  default_name = _at_k_name('false_positive', k, class_id=class_id)
+  with ops.name_scope(name, default_name, (predictions_idx, labels)) as scope:
+    fp = _sparse_false_positive_at_k(
+        predictions_idx=predictions_idx, labels=labels, class_id=class_id,
+        weights=weights)
+    batch_total_fp = math_ops.to_double(math_ops.reduce_sum(fp))
 
     var = contrib_variables.local_variable(
         array_ops.zeros([], dtype=dtypes.float64), name=scope)
     return var, state_ops.assign_add(var, batch_total_fp, name='update')
 
 
+def _sparse_false_negative_at_k(predictions_idx,
+                                labels,
+                                class_id=None,
+                                weights=None):
+  """Calculates false negatives for recall@k.
+
+  If `class_id` is specified, calculate binary false negatives for `class_id`
+      only.
+  If `class_id` is not specified, calculate metrics for `k` predicted vs
+      `n` label classes, where `n` is the 2nd dimension of `labels`.
+
+  Args:
+    predictions_idx: 1-D or higher `int64` `Tensor` with last dimension `k`,
+      top `k` predicted classes. For rank `n`, the first `n-1` dimensions must
+      match `labels`.
+    labels: `int64` `Tensor` or `SparseTensor` with shape
+      [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of
+      target classes for the associated prediction. Commonly, N=1 and `labels`
+      has shape [batch_size, num_labels]. [D1, ... DN] must match
+      `predictions_idx`.
+    class_id: Class for which we want binary metrics.
+    weights: `Tensor` whose shape is broadcastable to the first [D1, ... DN]
+      dimensions of `predictions_idx` and `labels`.
+
+  Returns:
+    A [D1, ... DN] `Tensor` of false negative counts.
+  """
+  with ops.name_scope(None, 'false_negatives', (predictions_idx, labels)):
+    labels, predictions_idx = _maybe_select_class_id(labels,
+                                                     predictions_idx,
+                                                     class_id)
+    fn = set_ops.set_size(set_ops.set_difference(predictions_idx,
+                                                 labels,
+                                                 aminusb=False))
+    fn = math_ops.to_double(fn)
+    if weights is not None:
+      weights = math_ops.to_double(weights)
+      fn = math_ops.mul(fn, weights)
+    return fn
+
+
 def _streaming_sparse_false_negative_at_k(predictions_idx,
                                           labels,
                                           k,
@@ -1677,22 +2069,12 @@ def _streaming_sparse_false_negative_at_k(predictions_idx,
   Raises:
     ValueError: If `weights` is not `None` and has an incomptable shape.
   """
-  default_name = 'false_negative_at_%d' % k
-  if class_id is not None:
-    default_name = '%s_class%d' % (default_name, class_id)
-  with ops.name_scope(name, default_name, [predictions_idx, labels]) as scope:
-    labels, predictions_idx = _maybe_select_class_id(labels,
-                                                     predictions_idx,
-                                                     class_id)
-    fn = set_ops.set_size(set_ops.set_difference(predictions_idx,
-                                                 labels,
-                                                 aminusb=False))
-    fn = math_ops.to_double(fn)
-    if weights is not None:
-      fn.get_shape().assert_is_compatible_with(weights.get_shape())
-      weights = math_ops.to_double(weights)
-      fn = math_ops.mul(fn, weights)
-    batch_total_fn = math_ops.reduce_sum(fn)
+  default_name = _at_k_name('false_negative', k, class_id=class_id)
+  with ops.name_scope(name, default_name, (predictions_idx, labels)) as scope:
+    fn = _sparse_false_negative_at_k(
+        predictions_idx=predictions_idx, labels=labels, class_id=class_id,
+        weights=weights)
+    batch_total_fn = math_ops.to_double(math_ops.reduce_sum(fn))
 
     var = contrib_variables.local_variable(
         array_ops.zeros([], dtype=dtypes.float64), name=scope)
@@ -1724,7 +2106,7 @@ def streaming_mean_absolute_error(predictions, labels, weights=None,
   Args:
     predictions: A `Tensor` of arbitrary shape.
     labels: A `Tensor` of the same shape as `predictions`.
-    weights: An optional `Tensor` whose shape matches `predictions`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
     metrics_collections: An optional list of collections that
       `mean_absolute_error` should be added to.
     updates_collections: An optional list of collections that `update_op` should
@@ -1777,7 +2159,7 @@ def streaming_mean_relative_error(predictions, labels, normalizer, weights=None,
     predictions: A `Tensor` of arbitrary shape.
     labels: A `Tensor` of the same shape as `predictions`.
     normalizer: A `Tensor` of the same shape as `predictions`.
-    weights: An optional `Tensor` whose shape matches `predictions`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
     metrics_collections: An optional list of collections that
       `mean_relative_error` should be added to.
     updates_collections: An optional list of collections that `update_op` should
@@ -1836,7 +2218,7 @@ def streaming_mean_squared_error(predictions, labels, weights=None,
   Args:
     predictions: A `Tensor` of arbitrary shape.
     labels: A `Tensor` of the same shape as `predictions`.
-    weights: An optional `Tensor` whose shape matches `predictions`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
     metrics_collections: An optional list of collections that
       `mean_squared_error` should be added to.
     updates_collections: An optional list of collections that `update_op` should
@@ -1888,7 +2270,7 @@ def streaming_root_mean_squared_error(predictions, labels, weights=None,
   Args:
     predictions: A `Tensor` of arbitrary shape.
     labels: A `Tensor` of the same shape as `predictions`.
-    weights: An optional `Tensor` whose shape matches `predictions`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
     metrics_collections: An optional list of collections that
       `root_mean_squared_error` should be added to.
     updates_collections: An optional list of collections that `update_op` should
@@ -1992,7 +2374,8 @@ def streaming_covariance(predictions,
       weighted_predictions = predictions
       weighted_labels = labels
     else:
-      batch_count = math_ops.reduce_sum(weights)  # n_B in update equation
+      batch_count = math_ops.reduce_sum(
+          _broadcast_weights(weights, labels))  # n_B in eqn
       weighted_predictions = predictions * weights
       weighted_labels = labels * weights
 
@@ -2095,9 +2478,9 @@ def streaming_pearson_correlation(predictions,
     update_op: An operation that updates the underlying variables appropriately.
 
   Raises:
-    ValueError: If labels and predictions are of different sizes or if the
-      ignore_mask is of the wrong size or if either `metrics_collections` or
-      `updates_collections` are not a list or tuple.
+    ValueError: If `labels` and `predictions` are of different sizes, or if
+      `weights` is the wrong size, or if either `metrics_collections` or
+      `updates_collections` are not a `list` or `tuple`.
   """
   with variable_scope.variable_scope(name, 'pearson_r', [predictions, labels]):
     predictions, labels = metric_ops_util.remove_squeezable_dimensions(
@@ -2153,8 +2536,8 @@ def streaming_mean_cosine_distance(predictions, labels, dim, weights=None,
     predictions: A `Tensor` of the same shape as `labels`.
     labels: A `Tensor` of arbitrary shape.
     dim: The dimension along which the cosine distance is computed.
-    weights: An optional `Tensor` whose shape matches `predictions`, and whose
-      dimension `dim` is 1.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`,
+      and whose dimension `dim` is 1.
     metrics_collections: An optional list of collections that the metric
       value variable should be added to.
     updates_collections: An optional list of collections that the metric update
@@ -2221,7 +2604,7 @@ def streaming_percentage_less(values, threshold, ignore_mask=None, weights=None,
     values: A numeric `Tensor` of arbitrary size.
     threshold: A scalar threshold.
     ignore_mask: An optional, `bool` `Tensor` whose shape matches `values`.
-    weights: An optional `Tensor` whose shape matches `values`.
+    weights: An optional `Tensor` whose shape is broadcastable to `values`.
     metrics_collections: An optional list of collections that the metric
       value variable should be added to.
     updates_collections: An optional list of collections that the metric update
@@ -2283,7 +2666,7 @@ def streaming_mean_iou(predictions,
       have. This value must be provided, since a confusion matrix of
       dimension = [num_classes, num_classes] will be allocated.
     ignore_mask: An optional, `bool` `Tensor` whose shape matches `predictions`.
-    weights: An optional `Tensor` whose shape matches `predictions`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
     metrics_collections: An optional list of collections that `mean_iou`
       should be added to.
     updates_collections: An optional list of collections `update_op` should be
@@ -2411,4 +2794,28 @@ def aggregate_metric_map(names_to_tuples):
   return dict(zip(metric_names, value_ops)), dict(zip(metric_names, update_ops))
 
 
-__all__ = make_all(__name__)
+__all__ = [
+    'aggregate_metric_map',
+    'aggregate_metrics',
+    'streaming_accuracy',
+    'streaming_auc',
+    'streaming_mean',
+    'streaming_mean_absolute_error',
+    'streaming_mean_cosine_distance',
+    'streaming_mean_iou',
+    'streaming_mean_relative_error',
+    'streaming_mean_squared_error',
+    'streaming_mean_tensor',
+    'streaming_percentage_less',
+    'streaming_precision',
+    'streaming_precision_at_thresholds',
+    'streaming_recall',
+    'streaming_recall_at_k',
+    'streaming_recall_at_thresholds',
+    'streaming_root_mean_squared_error',
+    'streaming_sensitivity_at_specificity',
+    'streaming_sparse_average_precision_at_k',
+    'streaming_sparse_precision_at_k',
+    'streaming_sparse_recall_at_k',
+    'streaming_specificity_at_sensitivity',
+]
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
index 9048f12b52f..2f33f4b648c 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
@@ -23,6 +23,7 @@ import math
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import tensorflow as tf
+from tensorflow.contrib.metrics.python.ops import metric_ops
 
 NAN = float('nan')
 
@@ -82,10 +83,7 @@ def _binary_2d_label_to_sparse(labels):
     `SparseTensor` whose values are indices along the last dimension of
     `labels`.
   """
-  v = _binary_2d_label_to_sparse_value(labels)
-  return tf.SparseTensor(tf.constant(v.indices, tf.int64),
-                         tf.constant(v.values, tf.int64),
-                         tf.constant(v.shape, tf.int64))
+  return tf.SparseTensor.from_value(_binary_2d_label_to_sparse_value(labels))
 
 
 def _binary_3d_label_to_sparse_value(labels):
@@ -131,10 +129,7 @@ def _binary_3d_label_to_sparse(labels):
     `SparseTensor` whose values are indices along the last dimension of
     `labels`.
   """
-  v = _binary_3d_label_to_sparse_value(labels)
-  return tf.SparseTensor(tf.constant(v.indices, tf.int64),
-                         tf.constant(v.values, tf.int64),
-                         tf.constant(v.shape, tf.int64))
+  return tf.SparseTensor.from_value(_binary_3d_label_to_sparse_value(labels))
 
 
 class StreamingMeanTest(tf.test.TestCase):
@@ -192,7 +187,58 @@ class StreamingMeanTest(tf.test.TestCase):
 
       self.assertAlmostEqual(1.65, sess.run(mean), 5)
 
-  def testWeightedValues(self):
+  def test1dWeightedValues(self):
+    with self.test_session() as sess:
+      # Create the queue that populates the values.
+      values_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2))
+      _enqueue_vector(sess, values_queue, [0, 1])
+      _enqueue_vector(sess, values_queue, [-4.2, 9.1])
+      _enqueue_vector(sess, values_queue, [6.5, 0])
+      _enqueue_vector(sess, values_queue, [-3.2, 4.0])
+      values = values_queue.dequeue()
+
+      # Create the queue that populates the weighted labels.
+      weights_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 1))
+      _enqueue_vector(sess, weights_queue, [1])
+      _enqueue_vector(sess, weights_queue, [0])
+      _enqueue_vector(sess, weights_queue, [0])
+      _enqueue_vector(sess, weights_queue, [1])
+      weights = weights_queue.dequeue()
+
+      mean, update_op = metrics.streaming_mean(values, weights)
+
+      tf.initialize_local_variables().run()
+      for _ in range(4):
+        update_op.eval()
+      self.assertAlmostEqual((0 + 1 - 3.2 + 4.0) / 4.0, mean.eval(), 5)
+
+  def test1dWeightedValues_placeholders(self):
+    with self.test_session() as sess:
+      # Create the queue that populates the values.
+      feed_values = (
+          (0, 1),
+          (-4.2, 9.1),
+          (6.5, 0),
+          (-3.2, 4.0)
+      )
+      values = tf.placeholder(dtype=tf.float32)
+
+      # Create the queue that populates the weighted labels.
+      weights_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 1))
+      _enqueue_vector(sess, weights_queue, [1])
+      _enqueue_vector(sess, weights_queue, [0])
+      _enqueue_vector(sess, weights_queue, [0])
+      _enqueue_vector(sess, weights_queue, [1])
+      weights = weights_queue.dequeue()
+
+      mean, update_op = metrics.streaming_mean(values, weights)
+
+      tf.initialize_local_variables().run()
+      for i in range(4):
+        update_op.eval(feed_dict={values: feed_values[i]})
+      self.assertAlmostEqual((0 + 1 - 3.2 + 4.0) / 4.0, mean.eval(), 5)
+
+  def test2dWeightedValues(self):
     with self.test_session() as sess:
       # Create the queue that populates the values.
       values_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2))
@@ -212,10 +258,36 @@ class StreamingMeanTest(tf.test.TestCase):
 
       mean, update_op = metrics.streaming_mean(values, weights)
 
-      sess.run(tf.initialize_local_variables())
+      tf.initialize_local_variables().run()
       for _ in range(4):
-        sess.run(update_op)
-      self.assertAlmostEqual(-0.8, sess.run(mean), 5)
+        update_op.eval()
+      self.assertAlmostEqual((0 + 1 - 4.2 + 0) / 4.0, mean.eval(), 5)
+
+  def test2dWeightedValues_placeholders(self):
+    with self.test_session() as sess:
+      # Create the queue that populates the values.
+      feed_values = (
+          (0, 1),
+          (-4.2, 9.1),
+          (6.5, 0),
+          (-3.2, 4.0)
+      )
+      values = tf.placeholder(dtype=tf.float32)
+
+      # Create the queue that populates the weighted labels.
+      weights_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2))
+      _enqueue_vector(sess, weights_queue, [1, 1])
+      _enqueue_vector(sess, weights_queue, [1, 0])
+      _enqueue_vector(sess, weights_queue, [0, 1])
+      _enqueue_vector(sess, weights_queue, [0, 0])
+      weights = weights_queue.dequeue()
+
+      mean, update_op = metrics.streaming_mean(values, weights)
+
+      tf.initialize_local_variables().run()
+      for i in range(4):
+        update_op.eval(feed_dict={values: feed_values[i]})
+      self.assertAlmostEqual((0 + 1 - 4.2 + 0) / 4.0, mean.eval(), 5)
 
 
 class StreamingMeanTensorTest(tf.test.TestCase):
@@ -294,7 +366,32 @@ class StreamingMeanTensorTest(tf.test.TestCase):
 
       self.assertAllClose([[-0.9/4., 3.525]], sess.run(mean), 5)
 
-  def testWeighted1(self):
+  def testWeighted1d(self):
+    with self.test_session() as sess:
+      # Create the queue that populates the values.
+      values_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2))
+      _enqueue_vector(sess, values_queue, [0, 1])
+      _enqueue_vector(sess, values_queue, [-4.2, 9.1])
+      _enqueue_vector(sess, values_queue, [6.5, 0])
+      _enqueue_vector(sess, values_queue, [-3.2, 4.0])
+      values = values_queue.dequeue()
+
+      # Create the queue that populates the weights.
+      weights_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 1))
+      _enqueue_vector(sess, weights_queue, [[1]])
+      _enqueue_vector(sess, weights_queue, [[0]])
+      _enqueue_vector(sess, weights_queue, [[1]])
+      _enqueue_vector(sess, weights_queue, [[0]])
+      weights = weights_queue.dequeue()
+
+      mean, update_op = metrics.streaming_mean_tensor(values, weights)
+
+      sess.run(tf.initialize_local_variables())
+      for _ in range(4):
+        sess.run(update_op)
+      self.assertAllClose([[3.25, 0.5]], sess.run(mean), 5)
+
+  def testWeighted2d_1(self):
     with self.test_session() as sess:
       # Create the queue that populates the values.
       values_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2))
@@ -319,7 +416,7 @@ class StreamingMeanTensorTest(tf.test.TestCase):
         sess.run(update_op)
       self.assertAllClose([[-2.1, 0.5]], sess.run(mean), 5)
 
-  def testWeightedValues2(self):
+  def testWeighted2d_2(self):
     with self.test_session() as sess:
       # Create the queue that populates the values.
       values_queue = tf.FIFOQueue(4, dtypes=tf.float32, shapes=(1, 2))
@@ -547,17 +644,73 @@ class StreamingPrecisionTest(tf.test.TestCase):
       self.assertAlmostEqual(0.5, update_op.eval())
       self.assertAlmostEqual(0.5, precision.eval())
 
-  def testWeighted(self):
-    predictions = tf.constant([1, 0, 1, 0], shape=(1, 4))
-    labels = tf.constant([0, 1, 1, 0], shape=(1, 4))
-    weights = tf.constant([1, 2, 3, 4], shape=(1, 4))
+  def testWeighted1d(self):
+    predictions = tf.constant([[1, 0, 1, 0], [1, 0, 1, 0]])
+    labels = tf.constant([[0, 1, 1, 0], [1, 0, 0, 1]])
     precision, update_op = metrics.streaming_precision(
-        predictions, labels, weights=weights)
+        predictions, labels, weights=tf.constant([[2], [5]]))
 
-    with self.test_session() as sess:
-      sess.run(tf.initialize_local_variables())
-      self.assertAlmostEqual(0.75, update_op.eval())
-      self.assertAlmostEqual(0.75, precision.eval())
+    with self.test_session():
+      tf.initialize_local_variables().run()
+      weighted_tp = 2.0 + 5.0
+      weighted_positives = (2.0 + 2.0) + (5.0 + 5.0)
+      expected_precision = weighted_tp / weighted_positives
+      self.assertAlmostEqual(expected_precision, update_op.eval())
+      self.assertAlmostEqual(expected_precision, precision.eval())
+
+  def testWeighted1d_placeholders(self):
+    predictions = tf.placeholder(dtype=tf.float32)
+    labels = tf.placeholder(dtype=tf.float32)
+    feed_dict = {
+        predictions: ((1, 0, 1, 0), (1, 0, 1, 0)),
+        labels: ((0, 1, 1, 0), (1, 0, 0, 1))
+    }
+    precision, update_op = metrics.streaming_precision(
+        predictions, labels, weights=tf.constant([[2], [5]]))
+
+    with self.test_session():
+      tf.initialize_local_variables().run()
+      weighted_tp = 2.0 + 5.0
+      weighted_positives = (2.0 + 2.0) + (5.0 + 5.0)
+      expected_precision = weighted_tp / weighted_positives
+      self.assertAlmostEqual(
+          expected_precision, update_op.eval(feed_dict=feed_dict))
+      self.assertAlmostEqual(
+          expected_precision, precision.eval(feed_dict=feed_dict))
+
+  def testWeighted2d(self):
+    predictions = tf.constant([[1, 0, 1, 0], [1, 0, 1, 0]])
+    labels = tf.constant([[0, 1, 1, 0], [1, 0, 0, 1]])
+    precision, update_op = metrics.streaming_precision(
+        predictions, labels, weights=tf.constant([[1, 2, 3, 4], [4, 3, 2, 1]]))
+
+    with self.test_session():
+      tf.initialize_local_variables().run()
+      weighted_tp = 3.0 + 4.0
+      weighted_positives = (1.0 + 3.0) + (4.0 + 2.0)
+      expected_precision = weighted_tp / weighted_positives
+      self.assertAlmostEqual(expected_precision, update_op.eval())
+      self.assertAlmostEqual(expected_precision, precision.eval())
+
+  def testWeighted2d_placeholders(self):
+    predictions = tf.placeholder(dtype=tf.float32)
+    labels = tf.placeholder(dtype=tf.float32)
+    feed_dict = {
+        predictions: ((1, 0, 1, 0), (1, 0, 1, 0)),
+        labels: ((0, 1, 1, 0), (1, 0, 0, 1))
+    }
+    precision, update_op = metrics.streaming_precision(
+        predictions, labels, weights=tf.constant([[1, 2, 3, 4], [4, 3, 2, 1]]))
+
+    with self.test_session():
+      tf.initialize_local_variables().run()
+      weighted_tp = 3.0 + 4.0
+      weighted_positives = (1.0 + 3.0) + (4.0 + 2.0)
+      expected_precision = weighted_tp / weighted_positives
+      self.assertAlmostEqual(
+          expected_precision, update_op.eval(feed_dict=feed_dict))
+      self.assertAlmostEqual(
+          expected_precision, precision.eval(feed_dict=feed_dict))
 
   def testAllIncorrect(self):
     inputs = np.random.randint(0, 2, size=(100, 1))
@@ -658,17 +811,35 @@ class StreamingRecallTest(tf.test.TestCase):
       self.assertAlmostEqual(0.5, update_op.eval())
       self.assertAlmostEqual(0.5, recall.eval())
 
-  def testWeighted(self):
-    predictions = tf.constant([1, 0, 1, 0], shape=(1, 4))
-    labels = tf.constant([0, 1, 1, 0], shape=(1, 4))
-    weights = tf.constant([1, 2, 3, 4], shape=(1, 4))
+  def testWeighted1d(self):
+    predictions = tf.constant([[1, 0, 1, 0], [0, 1, 0, 1]])
+    labels = tf.constant([[0, 1, 1, 0], [1, 0, 0, 1]])
+    weights = tf.constant([[2], [5]])
     recall, update_op = metrics.streaming_recall(
         predictions, labels, weights=weights)
 
     with self.test_session() as sess:
       sess.run(tf.initialize_local_variables())
-      self.assertAlmostEqual(0.6, update_op.eval())
-      self.assertAlmostEqual(0.6, recall.eval())
+      weighted_tp = 2.0 + 5.0
+      weighted_t = (2.0 + 2.0) + (5.0 + 5.0)
+      expected_precision = weighted_tp / weighted_t
+      self.assertAlmostEqual(expected_precision, update_op.eval())
+      self.assertAlmostEqual(expected_precision, recall.eval())
+
+  def testWeighted2d(self):
+    predictions = tf.constant([[1, 0, 1, 0], [0, 1, 0, 1]])
+    labels = tf.constant([[0, 1, 1, 0], [1, 0, 0, 1]])
+    weights = tf.constant([[1, 2, 3, 4], [4, 3, 2, 1]])
+    recall, update_op = metrics.streaming_recall(
+        predictions, labels, weights=weights)
+
+    with self.test_session() as sess:
+      sess.run(tf.initialize_local_variables())
+      weighted_tp = 3.0 + 1.0
+      weighted_t = (2.0 + 3.0) + (4.0 + 1.0)
+      expected_precision = weighted_tp / weighted_t
+      self.assertAlmostEqual(expected_precision, update_op.eval())
+      self.assertAlmostEqual(expected_precision, recall.eval())
 
   def testAllIncorrect(self):
     np_inputs = np.random.randint(0, 2, size=(100, 1))
@@ -760,7 +931,20 @@ class StreamingAUCTest(tf.test.TestCase):
 
       self.assertAlmostEqual(0.5, auc.eval())
 
-  def testWeighted(self):
+  def testWeighted1d(self):
+    with self.test_session() as sess:
+      predictions = tf.constant([1, 0, 1, 0], shape=(1, 4), dtype=tf.float32)
+      labels = tf.constant([0, 1, 1, 0], shape=(1, 4))
+      weights = tf.constant([2], shape=(1, 1))
+      auc, update_op = metrics.streaming_auc(predictions, labels,
+                                             weights=weights)
+
+      sess.run(tf.initialize_local_variables())
+      self.assertAlmostEqual(0.5, sess.run(update_op), 5)
+
+      self.assertAlmostEqual(0.5, auc.eval(), 5)
+
+  def testWeighted2d(self):
     with self.test_session() as sess:
       predictions = tf.constant([1, 0, 1, 0], shape=(1, 4), dtype=tf.float32)
       labels = tf.constant([0, 1, 1, 0], shape=(1, 4))
@@ -1008,7 +1192,25 @@ class StreamingSpecificityAtSensitivityTest(tf.test.TestCase):
       self.assertAlmostEqual(0.6, sess.run(update_op))
       self.assertAlmostEqual(0.6, specificity.eval())
 
-  def testWeighted(self):
+  def testWeighted1d(self):
+    predictions_values = [0.1, 0.2, 0.4, 0.3, 0.0,
+                          0.1, 0.2, 0.2, 0.26, 0.26]
+    labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+    weights_values = [3]
+
+    predictions = tf.constant(predictions_values, dtype=tf.float32)
+    labels = tf.constant(labels_values)
+    weights = tf.constant(weights_values)
+    specificity, update_op = metrics.streaming_specificity_at_sensitivity(
+        predictions, labels, weights=weights, sensitivity=0.4)
+
+    with self.test_session() as sess:
+      sess.run(tf.initialize_local_variables())
+
+      self.assertAlmostEqual(0.6, sess.run(update_op))
+      self.assertAlmostEqual(0.6, specificity.eval())
+
+  def testWeighted2d(self):
     predictions_values = [0.1, 0.2, 0.4, 0.3, 0.0,
                           0.1, 0.2, 0.2, 0.26, 0.26]
     labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
@@ -1240,7 +1442,34 @@ class StreamingPrecisionRecallThresholdsTest(tf.test.TestCase):
       self.assertAlmostEqual(0, prec.eval())
       self.assertAlmostEqual(0, rec.eval())
 
-  def testWeights(self):
+  def testWeights1d(self):
+    with self.test_session() as sess:
+      predictions = tf.constant([[1, 0], [1, 0]], shape=(2, 2),
+                                dtype=tf.float32)
+      labels = tf.constant([[0, 1], [1, 0]], shape=(2, 2))
+      weights = tf.constant([[0], [1]], shape=(2, 1), dtype=tf.float32)
+      thresholds = [0.5, 1.1]
+      prec, prec_op = metrics.streaming_precision_at_thresholds(
+          predictions, labels, thresholds, weights=weights)
+      rec, rec_op = metrics.streaming_recall_at_thresholds(
+          predictions, labels, thresholds, weights=weights)
+
+      [prec_low, prec_high] = tf.split(0, 2, prec)
+      prec_low = tf.reshape(prec_low, shape=())
+      prec_high = tf.reshape(prec_high, shape=())
+      [rec_low, rec_high] = tf.split(0, 2, rec)
+      rec_low = tf.reshape(rec_low, shape=())
+      rec_high = tf.reshape(rec_high, shape=())
+
+      sess.run(tf.initialize_local_variables())
+      sess.run([prec_op, rec_op])
+
+      self.assertAlmostEqual(1.0, prec_low.eval(), places=5)
+      self.assertAlmostEqual(0.0, prec_high.eval(), places=5)
+      self.assertAlmostEqual(1.0, rec_low.eval(), places=5)
+      self.assertAlmostEqual(0.0, rec_high.eval(), places=5)
+
+  def testWeights2d(self):
     with self.test_session() as sess:
       predictions = tf.constant([[1, 0], [1, 0]], shape=(2, 2),
                                 dtype=tf.float32)
@@ -1465,61 +1694,183 @@ class StreamingRecallAtKTest(tf.test.TestCase):
 
 class StreamingSparsePrecisionTest(tf.test.TestCase):
 
-  def _assert_precision_at_k(self,
-                             predictions,
-                             labels,
-                             k,
-                             expected,
-                             class_id=None,
-                             ignore_mask=None,
-                             weights=None):
-    if ignore_mask is not None:
-      ignore_mask = tf.constant(ignore_mask, tf.bool)
-    if weights is not None:
-      weights = tf.constant(weights, tf.float32)
-    loss, loss_update = metrics.streaming_sparse_precision_at_k(
-        predictions=tf.constant(predictions, tf.float32), labels=labels,
-        k=k, class_id=class_id, ignore_mask=ignore_mask, weights=weights)
+  def _test_streaming_sparse_precision_at_k(self,
+                                            predictions,
+                                            labels,
+                                            k,
+                                            expected,
+                                            class_id=None,
+                                            ignore_mask=None,
+                                            weights=None):
+    with tf.Graph().as_default() as g, self.test_session(g):
+      if ignore_mask is not None:
+        ignore_mask = tf.constant(ignore_mask, tf.bool)
+      if weights is not None:
+        weights = tf.constant(weights, tf.float32)
+      metric, update = metrics.streaming_sparse_precision_at_k(
+          predictions=tf.constant(predictions, tf.float32), labels=labels,
+          k=k, class_id=class_id, ignore_mask=ignore_mask, weights=weights)
 
-    # Fails without initialized vars.
-    self.assertRaises(tf.OpError, loss.eval)
-    self.assertRaises(tf.OpError, loss_update.eval)
-    tf.initialize_variables(tf.local_variables()).run()
+      # Fails without initialized vars.
+      self.assertRaises(tf.OpError, metric.eval)
+      self.assertRaises(tf.OpError, update.eval)
+      tf.initialize_variables(tf.local_variables()).run()
 
-    # Run per-step op and assert expected values.
-    if math.isnan(expected):
-      self.assertTrue(math.isnan(loss_update.eval()))
-      self.assertTrue(math.isnan(loss.eval()))
-    else:
-      self.assertEqual(expected, loss_update.eval())
-      self.assertEqual(expected, loss.eval())
+      # Run per-step op and assert expected values.
+      if math.isnan(expected):
+        self.assertTrue(math.isnan(update.eval()))
+        self.assertTrue(math.isnan(metric.eval()))
+      else:
+        self.assertEqual(expected, update.eval())
+        self.assertEqual(expected, metric.eval())
+
+  def _test_sparse_average_precision_at_k(self,
+                                          predictions,
+                                          labels,
+                                          k,
+                                          expected,
+                                          ignore_mask=None):
+    with tf.Graph().as_default() as g, self.test_session(g):
+      if ignore_mask is not None:
+        ignore_mask = tf.constant(ignore_mask, tf.bool)
+      predictions = tf.constant(predictions, tf.float32)
+      metric = metric_ops.sparse_average_precision_at_k(
+          predictions=predictions, labels=labels, k=k)
+      self.assertAllEqual(expected, metric.eval())
+
+  def _test_streaming_sparse_average_precision_at_k(
+      self, predictions, labels, k, expected, weights=None):
+    with tf.Graph().as_default() as g, self.test_session(g):
+      if weights is not None:
+        weights = tf.constant(weights, tf.float32)
+      predictions = tf.constant(predictions, tf.float32)
+      metric, update = metrics.streaming_sparse_average_precision_at_k(
+          predictions=predictions, labels=labels, k=k, weights=weights)
+
+      # Fails without initialized vars.
+      self.assertRaises(tf.OpError, metric.eval)
+      self.assertRaises(tf.OpError, update.eval)
+      local_variables = tf.local_variables()
+      tf.initialize_variables(local_variables).run()
+
+      # Run per-step op and assert expected values.
+      if math.isnan(expected):
+        self.assertTrue(math.isnan(update.eval()))
+        self.assertTrue(math.isnan(metric.eval()))
+      else:
+        self.assertAlmostEqual(expected, update.eval())
+        self.assertAlmostEqual(expected, metric.eval())
+
+  def test_average_precision(self):
+    # Example 1.
+    # Matches example here:
+    # fastml.com/what-you-wanted-to-know-about-mean-average-precision
+    labels_ex1 = (0, 1, 2, 3, 4)
+    labels = np.array([labels_ex1], dtype=np.int64)
+    predictions_ex1 = (0.2, 0.1, 0.0, 0.4, 0.0, 0.5, 0.3)  # [5, 3, 6, 0, 1]
+    predictions = (predictions_ex1,)
+    precision_ex1 = (
+        0.0 / 1,
+        1.0 / 2,
+        1.0 / 3,
+        2.0 / 4
+    )
+    avg_precision_ex1 = (
+        0.0 / 1,
+        precision_ex1[1] / 2,
+        precision_ex1[1] / 3,
+        (precision_ex1[1] + precision_ex1[3]) / 4
+    )
+    for i in xrange(4):
+      k = i + 1
+      self._test_streaming_sparse_precision_at_k(
+          predictions, labels, k, expected=precision_ex1[i])
+      self._test_sparse_average_precision_at_k(
+          predictions, labels, k, expected=[avg_precision_ex1[i]])
+      self._test_streaming_sparse_average_precision_at_k(
+          predictions, labels, k, expected=avg_precision_ex1[i])
+
+    # Example 2.
+    labels_ex2 = (0, 2, 4, 5, 6)
+    labels = np.array([labels_ex2], dtype=np.int64)
+    predictions_ex2 = (0.3, 0.5, 0.0, 0.4, 0.0, 0.1, 0.2)  # [1, 3, 0, 6, 5]
+    predictions = (predictions_ex2,)
+    precision_ex2 = (
+        0.0 / 1,
+        0.0 / 2,
+        1.0 / 3,
+        2.0 / 4
+    )
+    avg_precision_ex2 = (
+        0.0 / 1,
+        0.0 / 2,
+        precision_ex2[2] / 3,
+        (precision_ex2[2] + precision_ex2[3]) / 4
+    )
+    for i in xrange(4):
+      k = i + 1
+      self._test_streaming_sparse_precision_at_k(
+          predictions, labels, k, expected=precision_ex2[i])
+      self._test_sparse_average_precision_at_k(
+          predictions, labels, k, expected=[avg_precision_ex2[i]])
+      self._test_streaming_sparse_average_precision_at_k(
+          predictions, labels, k, expected=avg_precision_ex2[i])
+
+    # Both examples. The streaming precision and streaming average precision
+    # should be the average of the 2 examples; the non-streaming average
+    labels = np.array([labels_ex1, labels_ex2], dtype=np.int64)
+    predictions = (predictions_ex1, predictions_ex2)
+    average_precision = [
+        (ex1, ex2) for ex1, ex2 in zip(avg_precision_ex1, avg_precision_ex2)]
+    streaming_precision = [
+        (ex1 + ex2) / 2
+        for ex1, ex2 in zip(precision_ex1, precision_ex2)]
+    streaming_average_precision = [
+        (ex1 + ex2) / 2
+        for ex1, ex2 in zip(avg_precision_ex1, avg_precision_ex2)]
+    for i in xrange(4):
+      k = i + 1
+      self._test_streaming_sparse_precision_at_k(
+          predictions, labels, k, expected=streaming_precision[i])
+      self._test_sparse_average_precision_at_k(
+          predictions, labels, k, expected=average_precision[i])
+      self._test_streaming_sparse_average_precision_at_k(
+          predictions, labels, k, expected=streaming_average_precision[i])
+
+    # Weighted examples, we expect streaming average precision to be the
+    # weighted average of the 2 examples.
+    weights = (0.3, 0.6)
+    streaming_average_precision = [
+        (weights[0] * ex1 + weights[1] * ex2) / (weights[0] + weights[1])
+        for ex1, ex2 in zip(avg_precision_ex1, avg_precision_ex2)]
+    for i in xrange(4):
+      k = i + 1
+      self._test_streaming_sparse_average_precision_at_k(
+          predictions, labels, k, expected=streaming_average_precision[i],
+          weights=weights)
 
   def test_one_label_at_k1_no_predictions(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
     labels = [[0, 0, 0, 1], [0, 0, 1, 0]]
+    sp_labels = _binary_2d_label_to_sparse_value(labels)
 
     # Classes 0,1,2 have 0 predictions, class 4 is out of range.
     for class_id in [0, 1, 2, 4]:
-      with self.test_session():
-        self._assert_precision_at_k(
-            predictions, _binary_2d_label_to_sparse(labels), k=1, expected=NAN,
-            class_id=class_id)
+      self._test_streaming_sparse_precision_at_k(
+          predictions, sp_labels, k=1, expected=NAN, class_id=class_id)
 
   def test_one_label_at_k1(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
     labels = [[0, 0, 0, 1], [0, 0, 1, 0]]
+    sp_labels = _binary_2d_label_to_sparse_value(labels)
 
     # Class 3: 1 label, 2 predictions, 1 correct.
-    with self.test_session():
-      self._assert_precision_at_k(
-          predictions, _binary_2d_label_to_sparse(labels), k=1,
-          expected=1.0 / 2.0, class_id=3)
+    self._test_streaming_sparse_precision_at_k(
+        predictions, sp_labels, k=1, expected=1.0 / 2, class_id=3)
 
     # All classes: 2 labels, 2 predictions, 1 correct.
-    with self.test_session():
-      self._assert_precision_at_k(
-          predictions, _binary_2d_label_to_sparse(labels), k=1,
-          expected=1.0 / 2.0)
+    self._test_streaming_sparse_precision_at_k(
+        predictions, sp_labels, k=1, expected=1.0 / 2)
 
   def test_three_labels_at_k5_no_predictions(self):
     predictions = [
@@ -1530,13 +1881,12 @@ class StreamingSparsePrecisionTest(tf.test.TestCase):
         [0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
         [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]
     ]
+    sp_labels = _binary_2d_label_to_sparse_value(labels)
 
     # Classes 1,3,8 have 0 predictions, class 10 is out of range.
     for class_id in [1, 3, 8, 10]:
-      with self.test_session():
-        self._assert_precision_at_k(
-            predictions, _binary_2d_label_to_sparse(labels), k=5, expected=NAN,
-            class_id=class_id)
+      self._test_streaming_sparse_precision_at_k(
+          predictions, sp_labels, k=5, expected=NAN, class_id=class_id)
 
   def test_three_labels_at_k5_no_labels(self):
     predictions = [
@@ -1547,13 +1897,12 @@ class StreamingSparsePrecisionTest(tf.test.TestCase):
         [0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
         [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]
     ]
+    sp_labels = _binary_2d_label_to_sparse_value(labels)
 
     # Classes 0,4,6,9: 0 labels, >=1 prediction.
     for class_id in [0, 4, 6, 9]:
-      with self.test_session():
-        self._assert_precision_at_k(
-            predictions, _binary_2d_label_to_sparse(labels), k=5, expected=0.0,
-            class_id=class_id)
+      self._test_streaming_sparse_precision_at_k(
+          predictions, sp_labels, k=5, expected=0.0, class_id=class_id)
 
   def test_three_labels_at_k5(self):
     predictions = [
@@ -1564,30 +1913,23 @@ class StreamingSparsePrecisionTest(tf.test.TestCase):
         [0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
         [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]
     ]
+    sp_labels = _binary_2d_label_to_sparse_value(labels)
 
     # Class 2: 2 labels, 2 correct predictions.
-    with self.test_session():
-      self._assert_precision_at_k(
-          predictions, _binary_2d_label_to_sparse(labels), k=5,
-          expected=2.0 / 2.0, class_id=2)
+    self._test_streaming_sparse_precision_at_k(
+        predictions, sp_labels, k=5, expected=2.0 / 2, class_id=2)
 
     # Class 5: 1 label, 1 correct prediction.
-    with self.test_session():
-      self._assert_precision_at_k(
-          predictions, _binary_2d_label_to_sparse(labels), k=5,
-          expected=1.0 / 1.0, class_id=5)
+    self._test_streaming_sparse_precision_at_k(
+        predictions, sp_labels, k=5, expected=1.0 / 1, class_id=5)
 
     # Class 7: 1 label, 1 incorrect prediction.
-    with self.test_session():
-      self._assert_precision_at_k(
-          predictions, _binary_2d_label_to_sparse(labels), k=5,
-          expected=0.0 / 1.0, class_id=7)
+    self._test_streaming_sparse_precision_at_k(
+        predictions, sp_labels, k=5, expected=0.0 / 1, class_id=7)
 
     # All classes: 10 predictions, 3 correct.
-    with self.test_session():
-      self._assert_precision_at_k(
-          predictions, _binary_2d_label_to_sparse(labels), k=5,
-          expected=3.0 / 10.0)
+    self._test_streaming_sparse_precision_at_k(
+        predictions, sp_labels, k=5, expected=3.0 / 10)
 
   def test_3d_no_predictions(self):
     predictions = [[
@@ -1604,13 +1946,12 @@ class StreamingSparsePrecisionTest(tf.test.TestCase):
         [0, 1, 1, 0, 0, 1, 0, 1, 0, 0],
         [0, 0, 1, 0, 0, 0, 0, 0, 1, 0]
     ]]
+    sp_labels = _binary_3d_label_to_sparse_value(labels)
 
     # Classes 1,3,8 have 0 predictions, class 10 is out of range.
     for class_id in [1, 3, 8, 10]:
-      with self.test_session():
-        self._assert_precision_at_k(
-            predictions, _binary_3d_label_to_sparse(labels), k=5, expected=NAN,
-            class_id=class_id)
+      self._test_streaming_sparse_precision_at_k(
+          predictions, sp_labels, k=5, expected=NAN, class_id=class_id)
 
   def test_3d_no_labels(self):
     predictions = [[
@@ -1627,13 +1968,12 @@ class StreamingSparsePrecisionTest(tf.test.TestCase):
         [0, 1, 1, 0, 0, 1, 0, 1, 0, 0],
         [0, 0, 1, 0, 0, 0, 0, 0, 1, 0]
     ]]
+    sp_labels = _binary_3d_label_to_sparse_value(labels)
 
     # Classes 0,4,6,9: 0 labels, >=1 prediction.
     for class_id in [0, 4, 6, 9]:
-      with self.test_session():
-        self._assert_precision_at_k(
-            predictions, _binary_3d_label_to_sparse(labels), k=5, expected=0.0,
-            class_id=class_id)
+      self._test_streaming_sparse_precision_at_k(
+          predictions, sp_labels, k=5, expected=0.0, class_id=class_id)
 
   def test_3d(self):
     predictions = [[
@@ -1650,30 +1990,23 @@ class StreamingSparsePrecisionTest(tf.test.TestCase):
         [0, 1, 1, 0, 0, 1, 0, 1, 0, 0],
         [0, 0, 1, 0, 0, 0, 0, 0, 1, 0]
     ]]
+    sp_labels = _binary_3d_label_to_sparse_value(labels)
 
     # Class 2: 4 predictions, all correct.
-    with self.test_session():
-      self._assert_precision_at_k(
-          predictions, _binary_3d_label_to_sparse(labels), k=5,
-          expected=4.0 / 4.0, class_id=2)
+    self._test_streaming_sparse_precision_at_k(
+        predictions, sp_labels, k=5, expected=4.0 / 4, class_id=2)
 
     # Class 5: 2 predictions, both correct.
-    with self.test_session():
-      self._assert_precision_at_k(
-          predictions, _binary_3d_label_to_sparse(labels), k=5,
-          expected=2.0 / 2.0, class_id=5)
+    self._test_streaming_sparse_precision_at_k(
+        predictions, sp_labels, k=5, expected=2.0 / 2, class_id=5)
 
     # Class 7: 2 predictions, 1 correct.
-    with self.test_session():
-      self._assert_precision_at_k(
-          predictions, _binary_3d_label_to_sparse(labels), k=5,
-          expected=1.0 / 2.0, class_id=7)
+    self._test_streaming_sparse_precision_at_k(
+        predictions, sp_labels, k=5, expected=1.0 / 2, class_id=7)
 
     # All classes: 20 predictions, 7 correct.
-    with self.test_session():
-      self._assert_precision_at_k(
-          predictions, _binary_3d_label_to_sparse(labels), k=5,
-          expected=7.0 / 20.0)
+    self._test_streaming_sparse_precision_at_k(
+        predictions, sp_labels, k=5, expected=7.0 / 20)
 
   def test_3d_ignore_all(self):
     predictions = [[
@@ -1690,24 +2023,21 @@ class StreamingSparsePrecisionTest(tf.test.TestCase):
         [0, 1, 1, 0, 0, 1, 0, 1, 0, 0],
         [0, 0, 1, 0, 0, 0, 0, 0, 1, 0]
     ]]
+    sp_labels = _binary_3d_label_to_sparse_value(labels)
 
     for class_id in xrange(10):
-      with self.test_session():
-        self._assert_precision_at_k(
-            predictions, _binary_3d_label_to_sparse(labels), k=5, expected=NAN,
-            class_id=class_id, weights=[[0], [0]])
-      with self.test_session():
-        self._assert_precision_at_k(
-            predictions, _binary_3d_label_to_sparse(labels), k=5, expected=NAN,
-            class_id=class_id, weights=[[0, 0], [0, 0]])
-    with self.test_session():
-      self._assert_precision_at_k(
-          predictions, _binary_3d_label_to_sparse(labels), k=5, expected=NAN,
-          ignore_mask=[[False], [True]], weights=[[0], [1]])
-    with self.test_session():
-      self._assert_precision_at_k(
-          predictions, _binary_3d_label_to_sparse(labels), k=5, expected=NAN,
+      self._test_streaming_sparse_precision_at_k(
+          predictions, sp_labels, k=5, expected=NAN, class_id=class_id,
+          weights=[[0], [0]])
+      self._test_streaming_sparse_precision_at_k(
+          predictions, sp_labels, k=5, expected=NAN, class_id=class_id,
           weights=[[0, 0], [0, 0]])
+    self._test_streaming_sparse_precision_at_k(
+        predictions, sp_labels, k=5, expected=NAN,
+        ignore_mask=[[False], [True]], weights=[[0], [1]])
+    self._test_streaming_sparse_precision_at_k(
+        predictions, sp_labels, k=5, expected=NAN,
+        weights=[[0, 0], [0, 0]])
 
   def test_3d_ignore_some(self):
     predictions = [[
@@ -1724,46 +2054,37 @@ class StreamingSparsePrecisionTest(tf.test.TestCase):
         [0, 1, 1, 0, 0, 1, 0, 1, 0, 0],
         [0, 0, 1, 0, 0, 0, 0, 0, 1, 0]
     ]]
+    sp_labels = _binary_3d_label_to_sparse_value(labels)
 
     # Class 2: 2 predictions, both correct.
-    with self.test_session():
-      self._assert_precision_at_k(
-          predictions, _binary_3d_label_to_sparse(labels), k=5,
-          expected=2.0 / 2.0, class_id=2, ignore_mask=[[False], [False]],
-          weights=[[1], [0]])
+    self._test_streaming_sparse_precision_at_k(
+        predictions, sp_labels, k=5, expected=2.0 / 2.0, class_id=2,
+        ignore_mask=[[False], [False]], weights=[[1], [0]])
 
     # Class 2: 2 predictions, both correct.
-    with self.test_session():
-      self._assert_precision_at_k(
-          predictions, _binary_3d_label_to_sparse(labels), k=5,
-          expected=2.0 / 2.0, class_id=2, ignore_mask=[[False], [False]],
-          weights=[[0], [1]])
+    self._test_streaming_sparse_precision_at_k(
+        predictions, sp_labels, k=5, expected=2.0 / 2.0, class_id=2,
+        ignore_mask=[[False], [False]], weights=[[0], [1]])
 
     # Class 7: 1 incorrect prediction.
-    with self.test_session():
-      self._assert_precision_at_k(
-          predictions, _binary_3d_label_to_sparse(labels), k=5,
-          expected=0.0 / 1.0, class_id=7, ignore_mask=[[False], [True]],
-          weights=[[1], [1]])
+    self._test_streaming_sparse_precision_at_k(
+        predictions, sp_labels, k=5, expected=0.0 / 1.0, class_id=7,
+        ignore_mask=[[False], [True]], weights=[[1], [1]])
 
     # Class 7: 1 correct prediction.
-    with self.test_session():
-      self._assert_precision_at_k(
-          predictions, _binary_3d_label_to_sparse(labels), k=5,
-          expected=1.0 / 1.0, class_id=7, ignore_mask=[[True], [False]],
-          weights=[[1], [1]])
+    self._test_streaming_sparse_precision_at_k(
+        predictions, sp_labels, k=5, expected=1.0 / 1.0, class_id=7,
+        ignore_mask=[[True], [False]], weights=[[1], [1]])
 
     # Class 7: no predictions.
-    with self.test_session():
-      self._assert_precision_at_k(
-          predictions, _binary_3d_label_to_sparse(labels), k=5,
-          expected=NAN, class_id=7, weights=[[1, 0], [0, 1]])
+    self._test_streaming_sparse_precision_at_k(
+        predictions, sp_labels, k=5, expected=NAN, class_id=7,
+        weights=[[1, 0], [0, 1]])
 
     # Class 7: 2 predictions, 1 correct.
-    with self.test_session():
-      self._assert_precision_at_k(
-          predictions, _binary_3d_label_to_sparse(labels), k=5,
-          expected=1.0 / 2.0, class_id=7, weights=[[0, 1], [1, 0]])
+    self._test_streaming_sparse_precision_at_k(
+        predictions, sp_labels, k=5, expected=1.0 / 2.0, class_id=7,
+        weights=[[0, 1], [1, 0]])
 
   def test_sparse_tensor_value(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
@@ -1781,72 +2102,143 @@ class StreamingSparsePrecisionTest(tf.test.TestCase):
 
 class StreamingSparseRecallTest(tf.test.TestCase):
 
-  def _assert_recall_at_k(self,
-                          predictions,
-                          labels,
-                          k,
-                          expected,
-                          class_id=None,
-                          ignore_mask=None,
-                          weights=None):
-    if ignore_mask is not None:
-      ignore_mask = tf.constant(ignore_mask, tf.bool)
-    if weights is not None:
-      weights = tf.constant(weights, tf.float32)
-    loss, loss_update = metrics.streaming_sparse_recall_at_k(
-        predictions=tf.constant(predictions, tf.float32),
-        labels=labels, k=k, class_id=class_id, ignore_mask=ignore_mask,
-        weights=weights)
+  def _test_streaming_sparse_recall_at_k(self,
+                                         predictions,
+                                         labels,
+                                         k,
+                                         expected,
+                                         class_id=None,
+                                         ignore_mask=None,
+                                         weights=None):
+    with tf.Graph().as_default() as g, self.test_session(g):
+      if ignore_mask is not None:
+        ignore_mask = tf.constant(ignore_mask, tf.bool)
+      if weights is not None:
+        weights = tf.constant(weights, tf.float32)
+      metric, update = metrics.streaming_sparse_recall_at_k(
+          predictions=tf.constant(predictions, tf.float32),
+          labels=labels, k=k, class_id=class_id, ignore_mask=ignore_mask,
+          weights=weights)
 
-    # Fails without initialized vars.
-    self.assertRaises(tf.OpError, loss.eval)
-    self.assertRaises(tf.OpError, loss_update.eval)
-    tf.initialize_variables(tf.local_variables()).run()
+      # Fails without initialized vars.
+      self.assertRaises(tf.OpError, metric.eval)
+      self.assertRaises(tf.OpError, update.eval)
+      tf.initialize_variables(tf.local_variables()).run()
 
-    # Run per-step op and assert expected values.
-    if math.isnan(expected):
-      self.assertTrue(math.isnan(loss_update.eval()))
-      self.assertTrue(math.isnan(loss.eval()))
-    else:
-      self.assertEqual(expected, loss_update.eval())
-      self.assertEqual(expected, loss.eval())
+      # Run per-step op and assert expected values.
+      if math.isnan(expected):
+        self.assertTrue(math.isnan(update.eval()))
+        self.assertTrue(math.isnan(metric.eval()))
+      else:
+        self.assertEqual(expected, update.eval())
+        self.assertEqual(expected, metric.eval())
 
   def test_one_label_at_k1_empty_classes(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
     labels = [[0, 0, 0, 1], [0, 0, 1, 0]]
+    sp_labels = _binary_2d_label_to_sparse_value(labels)
 
     # Classes 0,1 have 0 labels, 0 predictions, class 4 is out of range.
     for class_id in [0, 1, 4]:
-      with self.test_session():
-        self._assert_recall_at_k(
-            predictions=predictions, labels=_binary_2d_label_to_sparse(labels),
-            k=1, expected=NAN, class_id=class_id)
+      self._test_streaming_sparse_recall_at_k(
+          predictions=predictions, labels=sp_labels, k=1, expected=NAN,
+          class_id=class_id)
 
   def test_one_label_at_k1_no_predictions(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
     labels = [[0, 0, 0, 1], [0, 0, 1, 0]]
+    sp_labels = _binary_2d_label_to_sparse_value(labels)
 
     # Class 2: 0 predictions.
-    with self.test_session():
-      self._assert_recall_at_k(
-          predictions=predictions, labels=_binary_2d_label_to_sparse(labels),
-          k=1, expected=0.0, class_id=2)
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=1, expected=0.0,
+        class_id=2)
 
   def test_one_label_at_k1(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
     labels = [[0, 0, 0, 1], [0, 0, 1, 0]]
+    sp_labels = _binary_2d_label_to_sparse_value(labels)
 
     # Class 3: 1 label, 2 predictions, 1 correct.
-    with self.test_session():
-      self._assert_recall_at_k(
-          predictions=predictions, labels=_binary_2d_label_to_sparse(labels),
-          k=1, expected=1.0 / 1.0, class_id=3)
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=1, expected=1.0 / 1,
+        class_id=3)
 
     # All classes: 2 labels, 2 predictions, 1 correct.
-    with self.test_session():
-      self._assert_recall_at_k(
-          predictions=predictions, labels=_binary_2d_label_to_sparse(labels),
-          k=1, expected=1.0 / 2.0)
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=1, expected=1.0 / 2)
+
+  def test_one_label_at_k1_weighted(self):
+    predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
+    labels = [[0, 0, 0, 1], [0, 0, 1, 0]]
+    sp_labels = _binary_2d_label_to_sparse_value(labels)
+
+    # Class 3: 1 label, 2 predictions, 1 correct.
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=1, expected=NAN,
+        class_id=3, weights=(0.0,))
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=1, expected=1.0 / 1,
+        class_id=3, weights=(1.0,))
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=1, expected=1.0 / 1,
+        class_id=3, weights=(2.0,))
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=1, expected=NAN,
+        class_id=3, weights=(0.0, 0.0))
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=1, expected=NAN,
+        class_id=3, weights=(0.0, 1.0))
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=1, expected=1.0 / 1,
+        class_id=3, weights=(1.0, 0.0))
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=1, expected=1.0 / 1,
+        class_id=3, weights=(1.0, 1.0))
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=1, expected=2.0 / 2,
+        class_id=3, weights=(2.0, 3.0))
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=1, expected=3.0 / 3,
+        class_id=3, weights=(3.0, 2.0))
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=1, expected=0.3 / 0.3,
+        class_id=3, weights=(0.3, 0.6))
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=1, expected=0.6 / 0.6,
+        class_id=3, weights=(0.6, 0.3))
+
+    # All classes: 2 labels, 2 predictions, 1 correct.
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=1, expected=NAN,
+        weights=(0.0,))
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=1, expected=1.0 / 2,
+        weights=(1.0,))
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=1, expected=1.0 / 2,
+        weights=(2.0,))
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=1, expected=1.0 / 1,
+        weights=(1.0, 0.0))
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=1, expected=0.0 / 1,
+        weights=(0.0, 1.0))
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=1, expected=1.0 / 2,
+        weights=(1.0, 1.0))
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=1, expected=2.0 / 5,
+        weights=(2.0, 3.0))
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=1, expected=3.0 / 5,
+        weights=(3.0, 2.0))
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=1, expected=0.3 / 0.9,
+        weights=(0.3, 0.6))
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=1, expected=0.6 / 0.9,
+        weights=(0.6, 0.3))
 
   def test_three_labels_at_k5_no_labels(self):
     predictions = [
@@ -1855,13 +2247,13 @@ class StreamingSparseRecallTest(tf.test.TestCase):
     labels = [
         [0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
         [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]]
+    sp_labels = _binary_2d_label_to_sparse_value(labels)
 
     # Classes 0,3,4,6,9 have 0 labels, class 10 is out of range.
     for class_id in [0, 3, 4, 6, 9, 10]:
-      with self.test_session():
-        self._assert_recall_at_k(
-            predictions=predictions, labels=_binary_2d_label_to_sparse(labels),
-            k=5, expected=NAN, class_id=class_id)
+      self._test_streaming_sparse_recall_at_k(
+          predictions=predictions, labels=sp_labels, k=5, expected=NAN,
+          class_id=class_id)
 
   def test_three_labels_at_k5_no_predictions(self):
     predictions = [
@@ -1870,12 +2262,12 @@ class StreamingSparseRecallTest(tf.test.TestCase):
     labels = [
         [0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
         [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]]
+    sp_labels = _binary_2d_label_to_sparse_value(labels)
 
     # Class 8: 1 label, no predictions.
-    with self.test_session():
-      self._assert_recall_at_k(
-          predictions=predictions, labels=_binary_2d_label_to_sparse(labels),
-          k=5, expected=0.0 / 1.0, class_id=8)
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=5, expected=0.0 / 1,
+        class_id=8)
 
   def test_three_labels_at_k5(self):
     predictions = [
@@ -1884,30 +2276,26 @@ class StreamingSparseRecallTest(tf.test.TestCase):
     labels = [
         [0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
         [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]]
+    sp_labels = _binary_2d_label_to_sparse_value(labels)
 
     # Class 2: 2 labels, both correct.
-    with self.test_session():
-      self._assert_recall_at_k(
-          predictions=predictions, labels=_binary_2d_label_to_sparse(labels),
-          k=5, expected=2.0 / 2.0, class_id=2)
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=5, expected=2.0 / 2,
+        class_id=2)
 
     # Class 5: 1 label, incorrect.
-    with self.test_session():
-      self._assert_recall_at_k(
-          predictions=predictions, labels=_binary_2d_label_to_sparse(labels),
-          k=5, expected=1.0 / 1.0, class_id=5)
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=5, expected=1.0 / 1,
+        class_id=5)
 
     # Class 7: 1 label, incorrect.
-    with self.test_session():
-      self._assert_recall_at_k(
-          predictions=predictions, labels=_binary_2d_label_to_sparse(labels),
-          k=5, expected=0.0 / 1.0, class_id=7)
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=5, expected=0.0 / 1,
+        class_id=7)
 
     # All classes: 6 labels, 3 correct.
-    with self.test_session():
-      self._assert_recall_at_k(
-          predictions=predictions, labels=_binary_2d_label_to_sparse(labels),
-          k=5, expected=3.0 / 6.0)
+    self._test_streaming_sparse_recall_at_k(
+        predictions=predictions, labels=sp_labels, k=5, expected=3.0 / 6)
 
   def test_3d_no_labels(self):
     predictions = [[
@@ -1924,13 +2312,12 @@ class StreamingSparseRecallTest(tf.test.TestCase):
         [0, 1, 1, 0, 0, 1, 0, 0, 0, 0],
         [0, 0, 1, 0, 0, 0, 0, 1, 1, 0]
     ]]
+    sp_labels = _binary_3d_label_to_sparse_value(labels)
 
     # Classes 0,3,4,6,9 have 0 labels, class 10 is out of range.
     for class_id in [0, 3, 4, 6, 9, 10]:
-      with self.test_session():
-        self._assert_recall_at_k(
-            predictions, _binary_3d_label_to_sparse(labels), k=5, expected=NAN,
-            class_id=class_id)
+      self._test_streaming_sparse_recall_at_k(
+          predictions, sp_labels, k=5, expected=NAN, class_id=class_id)
 
   def test_3d_no_predictions(self):
     predictions = [[
@@ -1947,13 +2334,12 @@ class StreamingSparseRecallTest(tf.test.TestCase):
         [0, 1, 1, 0, 0, 1, 0, 0, 0, 0],
         [0, 0, 1, 0, 0, 0, 0, 1, 1, 0]
     ]]
+    sp_labels = _binary_3d_label_to_sparse_value(labels)
 
     # Classes 1,8 have 0 predictions, >=1 label.
     for class_id in [1, 8]:
-      with self.test_session():
-        self._assert_recall_at_k(
-            predictions, _binary_3d_label_to_sparse(labels), k=5, expected=0.0,
-            class_id=class_id)
+      self._test_streaming_sparse_recall_at_k(
+          predictions, sp_labels, k=5, expected=0.0, class_id=class_id)
 
   def test_3d(self):
     predictions = [[
@@ -1970,30 +2356,23 @@ class StreamingSparseRecallTest(tf.test.TestCase):
         [0, 1, 1, 0, 0, 1, 0, 1, 0, 0],
         [0, 0, 1, 0, 0, 0, 0, 0, 1, 0]
     ]]
+    sp_labels = _binary_3d_label_to_sparse_value(labels)
 
     # Class 2: 4 labels, all correct.
-    with self.test_session():
-      self._assert_recall_at_k(
-          predictions, _binary_3d_label_to_sparse(labels), k=5,
-          expected=4.0 / 4.0, class_id=2)
+    self._test_streaming_sparse_recall_at_k(
+        predictions, sp_labels, k=5, expected=4.0 / 4, class_id=2)
 
     # Class 5: 2 labels, both correct.
-    with self.test_session():
-      self._assert_recall_at_k(
-          predictions, _binary_3d_label_to_sparse(labels), k=5,
-          expected=2.0 / 2.0, class_id=5)
+    self._test_streaming_sparse_recall_at_k(
+        predictions, sp_labels, k=5, expected=2.0 / 2, class_id=5)
 
     # Class 7: 2 labels, 1 incorrect.
-    with self.test_session():
-      self._assert_recall_at_k(
-          predictions, _binary_3d_label_to_sparse(labels), k=5,
-          expected=1.0 / 2.0, class_id=7)
+    self._test_streaming_sparse_recall_at_k(
+        predictions, sp_labels, k=5, expected=1.0 / 2, class_id=7)
 
     # All classes: 12 labels, 7 correct.
-    with self.test_session():
-      self._assert_recall_at_k(
-          predictions, _binary_3d_label_to_sparse(labels), k=5,
-          expected=7.0 / 12.0)
+    self._test_streaming_sparse_recall_at_k(
+        predictions, sp_labels, k=5, expected=7.0 / 12)
 
   def test_3d_ignore_all(self):
     predictions = [[
@@ -2010,24 +2389,20 @@ class StreamingSparseRecallTest(tf.test.TestCase):
         [0, 1, 1, 0, 0, 1, 0, 1, 0, 0],
         [0, 0, 1, 0, 0, 0, 0, 0, 1, 0]
     ]]
+    sp_labels = _binary_3d_label_to_sparse_value(labels)
 
     for class_id in xrange(10):
-      with self.test_session():
-        self._assert_recall_at_k(
-            predictions, _binary_3d_label_to_sparse(labels), k=5, expected=NAN,
-            class_id=class_id, weights=[[0], [0]])
-      with self.test_session():
-        self._assert_recall_at_k(
-            predictions, _binary_3d_label_to_sparse(labels), k=5, expected=NAN,
-            class_id=class_id, weights=[[0, 0], [0, 0]])
-    with self.test_session():
-      self._assert_recall_at_k(
-          predictions, _binary_3d_label_to_sparse(labels), k=5, expected=NAN,
-          ignore_mask=[[False], [True]], weights=[[0], [1]])
-    with self.test_session():
-      self._assert_recall_at_k(
-          predictions, _binary_3d_label_to_sparse(labels), k=5, expected=NAN,
+      self._test_streaming_sparse_recall_at_k(
+          predictions, sp_labels, k=5, expected=NAN, class_id=class_id,
+          weights=[[0], [0]])
+      self._test_streaming_sparse_recall_at_k(
+          predictions, sp_labels, k=5, expected=NAN, class_id=class_id,
           weights=[[0, 0], [0, 0]])
+    self._test_streaming_sparse_recall_at_k(
+        predictions, sp_labels, k=5, expected=NAN,
+        ignore_mask=[[False], [True]], weights=[[0], [1]])
+    self._test_streaming_sparse_recall_at_k(
+        predictions, sp_labels, k=5, expected=NAN, weights=[[0, 0], [0, 0]])
 
   def test_3d_ignore_some(self):
     predictions = [[
@@ -2044,46 +2419,37 @@ class StreamingSparseRecallTest(tf.test.TestCase):
         [0, 1, 1, 0, 0, 1, 0, 1, 0, 0],
         [0, 0, 1, 0, 0, 0, 0, 0, 1, 0]
     ]]
+    sp_labels = _binary_3d_label_to_sparse_value(labels)
 
     # Class 2: 2 labels, both correct.
-    with self.test_session():
-      self._assert_recall_at_k(
-          predictions, _binary_3d_label_to_sparse(labels), k=5,
-          expected=2.0 / 2.0, class_id=2, ignore_mask=[[False], [False]],
-          weights=[[1], [0]])
+    self._test_streaming_sparse_recall_at_k(
+        predictions, sp_labels, k=5, expected=2.0 / 2.0, class_id=2,
+        ignore_mask=[[False], [False]], weights=[[1], [0]])
 
     # Class 2: 2 labels, both correct.
-    with self.test_session():
-      self._assert_recall_at_k(
-          predictions, _binary_3d_label_to_sparse(labels), k=5,
-          expected=2.0 / 2.0, class_id=2, ignore_mask=[[False], [False]],
-          weights=[[0], [1]])
+    self._test_streaming_sparse_recall_at_k(
+        predictions, sp_labels, k=5, expected=2.0 / 2.0, class_id=2,
+        ignore_mask=[[False], [False]], weights=[[0], [1]])
 
     # Class 7: 1 label, correct.
-    with self.test_session():
-      self._assert_recall_at_k(
-          predictions, _binary_3d_label_to_sparse(labels), k=5,
-          expected=1.0 / 1.0, class_id=7, ignore_mask=[[True], [False]],
-          weights=[[1], [1]])
+    self._test_streaming_sparse_recall_at_k(
+        predictions, sp_labels, k=5, expected=1.0 / 1.0, class_id=7,
+        ignore_mask=[[True], [False]], weights=[[1], [1]])
 
     # Class 7: 1 label, incorrect.
-    with self.test_session():
-      self._assert_recall_at_k(
-          predictions, _binary_3d_label_to_sparse(labels), k=5,
-          expected=0.0 / 1.0, class_id=7, ignore_mask=[[False], [True]],
-          weights=[[1], [1]])
+    self._test_streaming_sparse_recall_at_k(
+        predictions, sp_labels, k=5, expected=0.0 / 1.0, class_id=7,
+        ignore_mask=[[False], [True]], weights=[[1], [1]])
 
     # Class 7: 2 labels, 1 correct.
-    with self.test_session():
-      self._assert_recall_at_k(
-          predictions, _binary_3d_label_to_sparse(labels), k=5,
-          expected=1.0 / 2.0, class_id=7, weights=[[1, 0], [1, 0]])
+    self._test_streaming_sparse_recall_at_k(
+        predictions, sp_labels, k=5, expected=1.0 / 2.0, class_id=7,
+        weights=[[1, 0], [1, 0]])
 
     # Class 7: No labels.
-    with self.test_session():
-      self._assert_recall_at_k(
-          predictions, _binary_3d_label_to_sparse(labels), k=5,
-          expected=NAN, class_id=7, weights=[[0, 1], [0, 1]])
+    self._test_streaming_sparse_recall_at_k(
+        predictions, sp_labels, k=5, expected=NAN, class_id=7,
+        weights=[[0, 1], [0, 1]])
 
   def test_sparse_tensor_value(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
@@ -2323,9 +2689,9 @@ class StreamingMeanSquaredErrorTest(tf.test.TestCase):
 
       sess.run(tf.initialize_local_variables())
       sess.run(update_op)
-      self.assertAlmostEqual(208 / 6.0, sess.run(update_op), 5)
+      self.assertAlmostEqual(208.0 / 6, sess.run(update_op), 5)
 
-      self.assertAlmostEqual(208 / 6.0, error.eval(), 5)
+      self.assertAlmostEqual(208.0 / 6, error.eval(), 5)
 
   def testMetricsComputedConcurrently(self):
     with self.test_session() as sess:
@@ -2363,8 +2729,8 @@ class StreamingMeanSquaredErrorTest(tf.test.TestCase):
       sess.run([update_op0, update_op1])
 
       mse0, mse1 = sess.run([mse0, mse1])
-      self.assertAlmostEqual(208 / 6.0, mse0, 5)
-      self.assertAlmostEqual(79 / 6.0, mse1, 5)
+      self.assertAlmostEqual(208.0 / 6, mse0, 5)
+      self.assertAlmostEqual(79.0 / 6, mse1, 5)
 
   def testMultipleMetricsOnMultipleBatchesOfSizeOne(self):
     with self.test_session() as sess:
@@ -2389,8 +2755,8 @@ class StreamingMeanSquaredErrorTest(tf.test.TestCase):
       sess.run([ma_update_op, ms_update_op])
       sess.run([ma_update_op, ms_update_op])
 
-      self.assertAlmostEqual(32 / 6.0, mae.eval(), 5)
-      self.assertAlmostEqual(208 / 6.0, mse.eval(), 5)
+      self.assertAlmostEqual(32.0 / 6, mae.eval(), 5)
+      self.assertAlmostEqual(208.0 / 6, mse.eval(), 5)
 
 
 class StreamingRootMeanSquaredErrorTest(tf.test.TestCase):
@@ -3286,5 +3652,264 @@ class AggregateMetricMapTest(tf.test.TestCase):
       self.assertEqual(4, names_to_values['m2'].eval())
 
 
+class NumRelevantTest(tf.test.TestCase):
+
+  def testNumRelevantInvalidArgs(self):
+    labels = tf.random_uniform(
+        shape=(3, 3, 3), minval=0, maxval=100, dtype=tf.int32)
+    with self.assertRaisesRegexp(ValueError, 'nvalid k'):
+      metric_ops.num_relevant(labels, k=0)
+    with self.assertRaisesRegexp(ValueError, 'nvalid k'):
+      metric_ops.num_relevant(labels, k=-1)
+
+  def testNumRelevantDense(self):
+    with self.test_session():
+      labels = tf.random_uniform(
+          shape=(3, 3, 3), minval=0, maxval=100, dtype=tf.int32)
+      ones = np.ones(shape=(3, 3))
+      self.assertAllEqual(ones, metric_ops.num_relevant(labels, k=1).eval())
+      twos = ones * 2
+      self.assertAllEqual(twos, metric_ops.num_relevant(labels, k=2).eval())
+      threes = ones * 3
+      self.assertAllEqual(threes, metric_ops.num_relevant(labels, k=3).eval())
+      self.assertAllEqual(threes, metric_ops.num_relevant(labels, k=4).eval())
+      self.assertAllEqual(threes, metric_ops.num_relevant(labels, k=999).eval())
+
+  def testNumRelevantSparse(self):
+    with self.test_session():
+      labels = tf.SparseTensorValue(
+          indices=(
+              (0, 0, 0), (0, 0, 1),
+              (0, 1, 0), (0, 1, 1), (0, 1, 2),
+              # (0, 2) missing
+              (1, 0, 0), (1, 0, 1), (1, 0, 2),
+              (1, 1, 0),
+              (1, 2, 0),
+              # (2, 0) missing
+              (2, 1, 0), (2, 1, 1),
+              (2, 2, 0)),
+          values=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13),
+          shape=(3, 3, 3))
+      self.assertAllEqual(
+          ((1, 1, 0), (1, 1, 1), (0, 1, 1)),
+          metric_ops.num_relevant(labels, k=1).eval())
+      self.assertAllEqual(
+          ((2, 2, 0), (2, 1, 1), (0, 2, 1)),
+          metric_ops.num_relevant(labels, k=2).eval())
+      label_lengths = ((2, 3, 0), (3, 1, 1), (0, 2, 1))
+      self.assertAllEqual(
+          label_lengths, metric_ops.num_relevant(labels, k=3).eval())
+      self.assertAllEqual(
+          label_lengths, metric_ops.num_relevant(labels, k=999).eval())
+
+
+class ExpandAndTileTest(tf.test.TestCase):
+
+  def testExpandAndTileInvalidArgs(self):
+    x = tf.ones(shape=(3, 3, 3))
+    with self.assertRaisesRegexp(ValueError, 'nvalid multiple'):
+      metric_ops.expand_and_tile(x, multiple=0)
+    with self.test_session():
+      with self.assertRaises(ValueError):
+        metric_ops.expand_and_tile(x, multiple=1, dim=-4).eval()
+      with self.assertRaises(ValueError):
+        metric_ops.expand_and_tile(x, multiple=1, dim=4).eval()
+
+  def testSparseExpandAndTileInvalidArgs(self):
+    x = tf.SparseTensorValue(
+        indices=[
+            (i, j, k) for i in range(3) for j in range(3) for k in range(3)],
+        values=[1] * 27,
+        shape=[3, 3, 3])
+    with self.assertRaisesRegexp(ValueError, 'nvalid multiple'):
+      metric_ops.expand_and_tile(x, multiple=0)
+    with self.test_session():
+      with self.assertRaises(tf.OpError):
+        metric_ops.expand_and_tile(x, multiple=1, dim=-4).eval()
+      with self.assertRaises(ValueError):
+        metric_ops.expand_and_tile(x, multiple=1, dim=4).eval()
+
+  def _test_expand_and_tile(
+      self, expected_shape, expected_value, tensor, multiple, dim=None):
+    with tf.Graph().as_default() as g, self.test_session(g):
+      if dim is None:
+        op = metric_ops.expand_and_tile(tensor=tensor, multiple=multiple)
+      else:
+        op = metric_ops.expand_and_tile(
+            tensor=tensor, multiple=multiple, dim=dim)
+      self.assertAllEqual(expected_shape, tf.shape(op).eval())
+      self.assertAllEqual(expected_value, op.eval())
+
+  # TODO(ptucker): Use @parameterized when it's available in tf.
+  def testExpandAndTile1x(self):
+    # Shape (3,3,3).
+    x = ((
+        (1, 2, 3),
+        (4, 5, 6),
+        (7, 8, 9)
+    ), (
+        (10, 11, 12),
+        (13, 14, 15),
+        (16, 17, 18)
+    ), (
+        (19, 20, 21),
+        (22, 23, 24),
+        (25, 26, 26)
+    ))
+    for dim in (None, -3, 0):
+      self._test_expand_and_tile(
+          expected_shape=(1, 3, 3, 3),
+          expected_value=[x],
+          tensor=x, multiple=1, dim=dim)
+
+    for dim in (-2, 1):
+      self._test_expand_and_tile(
+          expected_shape=(3, 1, 3, 3),
+          expected_value=[[x1] for x1 in x],
+          tensor=x, multiple=1, dim=dim)
+
+    for dim in (-1, 2):
+      self._test_expand_and_tile(
+          expected_shape=(3, 3, 1, 3),
+          expected_value=[[[x2] for x2 in x1] for x1 in x],
+          tensor=x, multiple=1, dim=dim)
+
+    self._test_expand_and_tile(
+        expected_shape=(3, 3, 3, 1),
+        expected_value=[[[[x3] for x3 in x2] for x2 in x1] for x1 in x],
+        tensor=x, multiple=1, dim=3)
+
+  # TODO(ptucker): Use @parameterized when it's available in tf.
+  def testExpandAndTile5x(self):
+    # Shape (3,3,3).
+    x = ((
+        (1, 2, 3),
+        (4, 5, 6),
+        (7, 8, 9)
+    ), (
+        (10, 11, 12),
+        (13, 14, 15),
+        (16, 17, 18)
+    ), (
+        (19, 20, 21),
+        (22, 23, 24),
+        (25, 26, 26)
+    ))
+    with self.test_session():
+      for dim in (None, -3, 0):
+        self._test_expand_and_tile(
+            expected_shape=(5, 3, 3, 3),
+            expected_value=[x] * 5,
+            tensor=x, multiple=5, dim=dim)
+
+      for dim in (-2, 1):
+        self._test_expand_and_tile(
+            expected_shape=(3, 5, 3, 3),
+            expected_value=[[x1] * 5 for x1 in x],
+            tensor=x, multiple=5, dim=dim)
+
+      for dim in (-1, 2):
+        self._test_expand_and_tile(
+            expected_shape=(3, 3, 5, 3),
+            expected_value=[[[x2] * 5 for x2 in x1] for x1 in x],
+            tensor=x, multiple=5, dim=dim)
+
+    self._test_expand_and_tile(
+        expected_shape=(3, 3, 3, 5),
+        expected_value=[[[[x3] * 5 for x3 in x2] for x2 in x1] for x1 in x],
+        tensor=x, multiple=5, dim=3)
+
+  def _assert_sparse_tensors_equal(self, expected, actual):
+    self.assertAllEqual(expected.indices, actual.indices)
+    self.assertAllEqual(expected.values, actual.values)
+    self.assertAllEqual(expected.shape, actual.shape)
+
+  # TODO(ptucker): Use @parameterized when it's available in tf.
+  def testSparseExpandAndTile1x(self):
+    # Shape (3,3).
+    x = tf.SparseTensorValue(
+        indices=[
+            [0, 0], [0, 1],
+            [1, 0], [1, 1], [1, 2],
+            [2, 0]],
+        values=[
+            1, 2,
+            3, 4, 5,
+            6],
+        shape=[3, 3])
+    with self.test_session():
+      expected_result_dim0 = tf.SparseTensorValue(
+          indices=[[0, i[0], i[1]] for i in x.indices], values=x.values,
+          shape=[1, 3, 3])
+      self._assert_sparse_tensors_equal(
+          expected_result_dim0, metric_ops.expand_and_tile(x, multiple=1).eval())
+      for dim in (-2, 0):
+        self._assert_sparse_tensors_equal(
+            expected_result_dim0,
+            metric_ops.expand_and_tile(x, multiple=1, dim=dim).eval())
+
+      expected_result_dim1 = tf.SparseTensorValue(
+          indices=[[i[0], 0, i[1]] for i in x.indices], values=x.values,
+          shape=[3, 1, 3])
+      for dim in (-1, 1):
+        self._assert_sparse_tensors_equal(
+            expected_result_dim1,
+            metric_ops.expand_and_tile(x, multiple=1, dim=dim).eval())
+
+      expected_result_dim2 = tf.SparseTensorValue(
+          indices=[[i[0], i[1], 0] for i in x.indices], values=x.values,
+          shape=[3, 3, 1])
+      self._assert_sparse_tensors_equal(
+          expected_result_dim2,
+          metric_ops.expand_and_tile(x, multiple=1, dim=2).eval())
+
+  # TODO(ptucker): Use @parameterized when it's available in tf.
+  def testSparseExpandAndTile5x(self):
+    # Shape (3,3).
+    x = tf.SparseTensorValue(
+        indices=(
+            (0, 0), (0, 1),
+            (1, 0), (1, 1), (1, 2),
+            (2, 0)),
+        values=(
+            1, 2,
+            3, 4, 5,
+            6),
+        shape=(3, 3))
+    with self.test_session():
+      expected_result_dim0 = tf.SparseTensorValue(
+          indices=[(d0, i[0], i[1]) for d0 in range(5) for i in x.indices],
+          values=[v for _ in range(5) for v in x.values],
+          shape=(5, 3, 3))
+      self._assert_sparse_tensors_equal(
+          expected_result_dim0,
+          metric_ops.expand_and_tile(x, multiple=5).eval())
+      for dim in (-2, 0):
+        self._assert_sparse_tensors_equal(
+            expected_result_dim0,
+            metric_ops.expand_and_tile(x, multiple=5, dim=dim).eval())
+
+      expected_result_dim1 = tf.SparseTensorValue(
+          indices=[
+              (d0, d1, i[1])
+              for d0 in range(3)
+              for d1 in range(5)
+              for i in x.indices if i[0] == d0],
+          values=x.values[0:2] * 5 + x.values[2:5] * 5 + x.values[5:] * 5,
+          shape=(3, 5, 3))
+      for dim in (-1, 1):
+        self._assert_sparse_tensors_equal(
+            expected_result_dim1,
+            metric_ops.expand_and_tile(x, multiple=5, dim=dim).eval())
+
+      expected_result_dim2 = tf.SparseTensorValue(
+          indices=[(i[0], i[1], d2) for i in x.indices for d2 in range(5)],
+          values=[v for v in x.values for _ in range(5)],
+          shape=(3, 3, 5))
+      self._assert_sparse_tensors_equal(
+          expected_result_dim2,
+          metric_ops.expand_and_tile(x, multiple=5, dim=2).eval())
+
+
 if __name__ == '__main__':
   tf.test.main()
diff --git a/tensorflow/contrib/quantization/kernels/hexagon/quantized_matmul_op_for_hexagon_test.cc b/tensorflow/contrib/quantization/kernels/hexagon/quantized_matmul_op_for_hexagon_test.cc
index 36bd77ecfe1..3d139fbe0a0 100644
--- a/tensorflow/contrib/quantization/kernels/hexagon/quantized_matmul_op_for_hexagon_test.cc
+++ b/tensorflow/contrib/quantization/kernels/hexagon/quantized_matmul_op_for_hexagon_test.cc
@@ -46,7 +46,7 @@ class QuantizedMatMulOpForHexagonTest : public OpsTestBase {
               << ", hexagon binary version = "
               << hexagon_gemm_wrapper_GetHexagonBinaryVersion() << ")";
     LOG(INFO) << "Cpu frequency = "
-              << profile_utils::CpuUtils::GetCpuFrequency();
+              << profile_utils::CpuUtils::GetCycleCounterFrequency();
 #else
     LOG(WARNING) << "Hexagon libs are not linked.";
 #endif
diff --git a/tensorflow/contrib/slim/python/slim/learning.py b/tensorflow/contrib/slim/python/slim/learning.py
index 59fecea0c99..0f3cf62d438 100644
--- a/tensorflow/contrib/slim/python/slim/learning.py
+++ b/tensorflow/contrib/slim/python/slim/learning.py
@@ -663,7 +663,7 @@ def train(train_op,
       raise ValueError('Cannot provide trace_every_n_steps because '
                        'logdir=None')
 
-  if sync_optimizer and startup_delay_steps > 0:
+  if sync_optimizer is not None and startup_delay_steps > 0:
     raise ValueError(
         'startup_delay_steps must be zero when sync_optimizer is supplied.')
 
@@ -697,7 +697,7 @@ def train(train_op,
 
     cleanup_op = None
 
-    if is_chief and sync_optimizer:
+    if is_chief and sync_optimizer is not None:
       if not isinstance(sync_optimizer,
                         sync_replicas_optimizer.SyncReplicasOptimizer):
         raise ValueError(
@@ -761,7 +761,7 @@ def train(train_op,
                              number_of_steps or sys.maxint))
         sv.start_queue_runners(sess)
         logging.info('Starting Queues.')
-        if is_chief and sync_optimizer:
+        if is_chief and sync_optimizer is not None:
           sv.start_queue_runners(sess, [chief_queue_runner])
         try:
           while not sv.should_stop():
diff --git a/tensorflow/contrib/tensorboard/BUILD b/tensorflow/contrib/tensorboard/BUILD
new file mode 100644
index 00000000000..94b7222737a
--- /dev/null
+++ b/tensorflow/contrib/tensorboard/BUILD
@@ -0,0 +1,68 @@
+# Description:
+# TensorBoard module containing volatile or experimental code.
+
+package(default_visibility = ["//tensorflow:internal"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+# For platform specific build config
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
+
+tf_proto_library(
+    name = "protos_all",
+    srcs = glob(["**/*.proto"]),
+    go_api_version = 2,
+    visibility = ["//visibility:public"],
+)
+
+# API methods in `tf.contrib.tensorboard` package.
+py_library(
+    name = "tensorboard",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = [":plugins"],
+)
+
+# API methods in `tf.contrib.tensorboard.plugins` package.
+py_library(
+    name = "plugins",
+    srcs = ["plugins/__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = [":projector"],
+)
+
+# API methods and protos in `tf.contrib.tensorboard.plugins.projector` package.
+py_library(
+    name = "projector",
+    srcs = ["plugins/projector/__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":protos_all_py",
+        "//tensorflow/python:lib",
+    ],
+)
+
+py_test(
+    name = "projector_api_test",
+    size = "small",
+    srcs = ["plugins/projector/projector_api_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":projector",
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/tensorboard/__init__.py b/tensorflow/contrib/tensorboard/__init__.py
new file mode 100644
index 00000000000..129f5feacf2
--- /dev/null
+++ b/tensorflow/contrib/tensorboard/__init__.py
@@ -0,0 +1,22 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""tensorboard module containing volatile or experimental code."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Add projects here, they will show up under tf.contrib.tensorboard.
+from tensorflow.contrib.tensorboard import plugins
diff --git a/tensorflow/contrib/tensorboard/plugins/__init__.py b/tensorflow/contrib/tensorboard/plugins/__init__.py
new file mode 100644
index 00000000000..88336714a7c
--- /dev/null
+++ b/tensorflow/contrib/tensorboard/plugins/__init__.py
@@ -0,0 +1,22 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""tensorboard plugins module containing volatile or experimental code."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Add projects here, they will show up under tf.contrib.tensorboard.plugins
+from tensorflow.contrib.tensorboard.plugins import projector
diff --git a/tensorflow/contrib/tensorboard/plugins/projector/__init__.py b/tensorflow/contrib/tensorboard/plugins/projector/__init__.py
new file mode 100644
index 00000000000..09a8b592f7f
--- /dev/null
+++ b/tensorflow/contrib/tensorboard/plugins/projector/__init__.py
@@ -0,0 +1,54 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Public API for the Embedding Projector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from google.protobuf import text_format
+from tensorflow.contrib.tensorboard.plugins.projector.projector_config_pb2 import EmbeddingInfo
+from tensorflow.contrib.tensorboard.plugins.projector.projector_config_pb2 import ProjectorConfig
+from tensorflow.python.lib.io import file_io
+
+PROJECTOR_FILENAME = 'projector_config.pbtxt'
+
+
+def visualize_embeddings(summary_writer, config):
+  """Stores a config file used by the embedding projector.
+
+  Args:
+    summary_writer: The summary writer used for writing events.
+    config: `tf.contrib.tensorboard.plugins.projector.ProjectorConfig`
+      proto that holds the configuration for the projector such as paths to
+      checkpoint files and metadata files for the embeddings. If
+      `config.model_checkpoint_path` is `None`, it defaults to the
+      `logdir` used by the summary_writer.
+
+  Raises:
+    ValueError: If the summary writer does not have a `logdir`.
+  """
+  logdir = summary_writer.get_logdir()
+
+  # Sanity checks.
+  if logdir is None:
+    raise ValueError('Summary writer must have a logdir')
+
+  # Saving the config file in the logdir.
+  config_pbtxt = text_format.MessageToString(config)
+  file_io.write_string_to_file(
+      os.path.join(logdir, PROJECTOR_FILENAME), config_pbtxt)
diff --git a/tensorflow/contrib/tensorboard/plugins/projector/projector_api_test.py b/tensorflow/contrib/tensorboard/plugins/projector/projector_api_test.py
new file mode 100644
index 00000000000..3114b751b56
--- /dev/null
+++ b/tensorflow/contrib/tensorboard/plugins/projector/projector_api_test.py
@@ -0,0 +1,49 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""API tests for the projector plugin in TensorBoard."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+import tensorflow as tf
+
+from google.protobuf import text_format
+
+
+class ProjectorApiTest(tf.test.TestCase):
+
+  def testVisualizeEmbeddings(self):
+    # Create a dummy configuration.
+    config = tf.contrib.tensorboard.plugins.projector.ProjectorConfig()
+    config.model_checkpoint_path = 'test'
+    emb1 = config.embedding.add()
+    emb1.tensor_name = 'tensor1'
+    emb1.metadata_path = 'metadata1'
+
+    # Call the API method to save the configuration to a temporary dir.
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir)
+    writer = tf.train.SummaryWriter(temp_dir)
+    tf.contrib.tensorboard.plugins.projector.visualize_embeddings(writer,
+                                                                  config)
+
+    # Read the configuration from disk and make sure it matches the original.
+    with tf.gfile.GFile(os.path.join(temp_dir, 'projector_config.pbtxt')) as f:
+      config2 = tf.contrib.tensorboard.plugins.projector.ProjectorConfig()
+      text_format.Parse(f.read(), config2)
+      self.assertEqual(config, config2)
diff --git a/tensorflow/core/platform/file_system_test.cc b/tensorflow/contrib/tensorboard/plugins/projector/projector_config.proto
similarity index 62%
rename from tensorflow/core/platform/file_system_test.cc
rename to tensorflow/contrib/tensorboard/plugins/projector/projector_config.proto
index 6d30bc47865..150f98aef76 100644
--- a/tensorflow/core/platform/file_system_test.cc
+++ b/tensorflow/contrib/tensorboard/plugins/projector/projector_config.proto
@@ -13,19 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/platform/file_system.h"
+syntax = "proto3";
 
-#include "tensorflow/core/platform/test.h"
+package tensorflow;
 
-namespace tensorflow {
-namespace {
-
-TEST(FileSystemTest, GetNameFromURI) {
-  EXPECT_EQ("foo", GetNameFromURI("file://foo"));
-  EXPECT_EQ("file:/", GetNameFromURI("file:/"));
-  EXPECT_EQ("file:", GetNameFromURI("file:"));
-  EXPECT_EQ("bar", GetNameFromURI("bar"));
+message EmbeddingInfo {
+  string tensor_name = 1;
+  string metadata_path = 2;
 }
 
-}  // namespace
-}  // namespace tensorflow
+message ProjectorConfig {
+  string model_checkpoint_path = 1;
+  repeated EmbeddingInfo embedding = 2;
+}
diff --git a/tensorflow/contrib/tfprof/README.md b/tensorflow/contrib/tfprof/README.md
new file mode 100644
index 00000000000..0e6420134a5
--- /dev/null
+++ b/tensorflow/contrib/tfprof/README.md
@@ -0,0 +1,453 @@
+# tfprof: A Profiling Tool for TensorFlow Models
+
+go/tfprof
+
+Author: Xin Pan (xpan@google.com, github: panyx0718)
+
+Consultants: Jon Shlens (shlens@google.com), Pete Warden (petewarden@google.com)
+
+[TOC]
+
+## Introduction
+
+tfprof is a profiling tool for TensorFlow that analyzes model architectures
+and measures system performance.
+
+### Major Features
+
+1.  Measure model parameters, float operations, tensor shapes.
+2.  Measure op execution times, requested memory size and device placement.
+3.  Inspect checkpoint tensors' shapes and their values.
+4.  Explore model based on name scope or graph structure.
+5.  Selectively grouping/filtering/accounting/ordering ops.
+
+### Interfaces
+
+[CLI Tutorials](#cli-tutorials):
+It supports interactive mode for exploration and single-shot mode for
+scripts. Outputs can be dumped to files or printed in terminal.
+
+Python API Tutorials: Python API is not released yet.
+
+## CLI Tutorials
+
+Tutorials are based on a 32 layers ResNet.
+TODO(xpan): Provide graph.pbtxt, model.ckpt, tfprof_log and run_meta download.
+
+### Examples
+
+1) Start `tfprof` command line tool
+
+```shell
+# Build the tool.
+bazel build -c opt tensorflow/contrib/tfprof/...
+
+# Help information, including detail 'option' instructions.
+bazel-bin/tensorflow/contrib/tfprof/tools/tfprof/tfprof help
+#
+# The following commands will start tfprof interactive mode.
+#
+# Profile model shapes and parameters only.
+bazel-bin/tensorflow/contrib/tfprof/tools/tfprof/tfprof \
+    --graph_path=/graph.pbtxt
+#
+# Additionally profile checkpoint statistics and values.
+# Use '-account_type_regexes _checkpoint_variables' to select
+# checkpoint tensors.
+bazel-bin/tensorflow/contrib/tfprof/tools/tfprof/tfprof \
+    --graph_path=graph.pbtxt \
+    --checkpoint_path=model.ckpt
+#
+# Additionally profile ops requested memory and timing.
+# See CLI Input Files section on generating run_meta file.
+bazel-bin/tensorflow/contrib/tfprof/tools/tfprof/tfprof \
+    --graph_path=graph.pbtxt \
+    --run_meta_path=run_meta \
+    --checkpoint_path=model.ckpt
+#
+# tfprof_log is used to define customized op types and float ops.
+# Use tfprof_logger.write_op_log() to create tfprof_log.
+# See 11) in Examples section on generating tfprof_log file.
+bazel-bin/tensorflow/contrib/tfprof/tools/tfprof/tfprof \
+    --graph_path=graph.pbtxt \
+    --run_meta_path=run_meta \
+    --op_log_path=tfprof_log \
+    --checkpoint_path=model.ckpt
+```
+Note that `graph.pbtxt` is an ASCII text format.
+
+2) Press enter to show the default options
+
+```shell
+tfprof>
+tfprof>
+-max_depth                  4
+-min_bytes                  0
+-min_micros                 0
+-min_params                 0
+-min_float_ops              0
+-device_regexes             .*
+-order_by                   name
+-account_type_regexes       Variable
+-start_name_regexes         .*
+-trim_name_regexes
+-show_name_regexes          .*
+-hide_name_regexes          IsVariableInitialized_[0-9]+,save\/.*,^zeros[0-9_]*
+-account_displayed_op_only  false
+# supported select fileds. Availability depends on --[run_meta|checkpoint|op_log]_path.
+# [bytes|micros|params|float_ops|num_hidden_ops|tensor_value|device|op_types]
+-select                     params
+-viz                        false
+-dump_to_file
+```
+
+3) I want to see the `BatchNorm`'s gamma value in checkpoint.
+
+```shell
+# Requires --graph_path, --checkpoint_path.
+tfprof> scope -show_name_regexes unit_1_0.*gamma -select tensor_value -max_depth 5
+_TFProfRoot ()
+  unit_1_0/shared_activation/init_bn/gamma ()
+[1.80 2.10 2.06 1.91 2.26 1.86 1.81 1.37 1.78 1.85 1.96 1.54 2.04 2.34 2.22 1.99 ],
+  unit_1_0/sub2/bn2/gamma ()
+[1.57 1.83 1.30 1.25 1.59 1.14 1.26 0.82 1.19 1.10 1.48 1.01 0.82 1.23 1.21 1.14 ],
+```
+
+4) I want to see my checkpoint tensors shape and number of parameters.
+
+```shell
+# Requires --graph_path, --checkpoint_path.
+# Increase -max_depth to see all tensors.
+tfprof> scope -account_type_regexes _checkpoint_variables -select params -max_depth 4
+_TFProfRoot (--/930.58k params)
+  global_step (0/0 params)
+  init/init_conv/DW (3x3x3x16, 432/864 params)
+  pool_logit/DW (64x10, 640/1.28k params)
+    pool_logit/DW/Momentum (64x10, 640/640 params)
+  pool_logit/biases (10, 10/20 params)
+    pool_logit/biases/Momentum (10, 10/10 params)
+  unit_last/final_bn/beta (64, 64/128 params)
+  unit_last/final_bn/gamma (64, 64/128 params)
+  unit_last/final_bn/moving_mean (64, 64/64 params)
+  unit_last/final_bn/moving_variance (64, 64/64 params)
+```
+
+5) I defined an op named ‘cost’ to calculate the loss. I want to know what ops
+it depends on take a long time to run. Hint: Use the ‘graph’ command to explore
+graph dependencies.
+
+```shell
+# Requires --graph_path, --run_meta_path.
+tfprof> graph -start_name_regexes cost.* -max_depth 100 -min_micros 10000 -select micros -account_type_regexes .*
+_TFProfRoot (0us/3.61sec)
+  init/init_conv/Conv2D (11.75ms/3.10sec)
+    random_shuffle_queue_DequeueMany (3.09sec/3.09sec)
+  unit_1_0/sub2/conv2/Conv2D (74.14ms/3.19sec)
+  unit_1_3/sub2/conv2/Conv2D (60.75ms/3.34sec)
+  unit_2_4/sub2/conv2/Conv2D (73.58ms/3.54sec)
+  unit_3_3/sub2/conv2/Conv2D (10.26ms/3.60sec)
+```
+
+6) I want to know the expensive operations during the back propagation.
+Hint: tensorflow prepends ‘gradient’ to your defined name scopes. Use the ‘scope’
+command to explore based on name scope hierarchies.
+
+```shell
+# Requires --graph_path, --run_meta_path.
+tfprof> scope -start_name_regexes gradient.* -max_depth 100 -min_micros 20000 -select micros -account_type_regexes .*
+_TFProfRoot (0us/2.29sec)
+  gradients/unit_1_0/sub1/conv1/Conv2D_grad/Conv2DBackpropFilter (54.96ms/54.96ms)
+  gradients/unit_1_0/sub2/conv2/Conv2D_grad/Conv2DBackpropFilter (83.63ms/83.63ms)
+  gradients/unit_1_1/sub1/conv1/Conv2D_grad/Conv2DBackpropFilter (99.25ms/99.25ms)
+  gradients/unit_1_2/sub1/conv1/Conv2D_grad/Conv2DBackpropFilter (95.40ms/95.40ms)
+  gradients/unit_1_2/sub2/conv2/Conv2D_grad/Conv2DBackpropFilter (99.83ms/99.83ms)
+  gradients/unit_1_3/sub1/conv1/Conv2D_grad/Conv2DBackpropFilter (95.39ms/95.39ms)
+  ...
+```
+
+7) Show the number of float operations in the model.
+Note: float operations calculation depends on
+1) op.RegisterStatistics. If an op doesn’t
+have RegisterStatistics defined, its float operations cannot be counted.
+2) fully defined shape is also necessary in order to calculate flops.
+float operations number is provided by tensorflow::tfprof::OpLog logged from
+Python API.
+
+```shell
+# Requires --graph_path, --op_log_path.
+tfprof> scope -min_float_ops 1 -max_depth 10 -select float_ops -account_type_regexes .*
+_TFProfRoot (0/17.63b flops)
+  gradients/pool_logit/xw_plus_b/MatMul_grad/MatMul (163.84k/163.84k flops)
+  gradients/pool_logit/xw_plus_b/MatMul_grad/MatMul_1 (163.84k/163.84k flops)
+  init/init_conv/Conv2D (113.25m/113.25m flops)
+  pool_logit/xw_plus_b (1.28k/165.12k flops)
+    pool_logit/xw_plus_b/MatMul (163.84k/163.84k flops)
+  unit_1_0/sub1/conv1/Conv2D (603.98m/603.98m flops)
+  unit_1_0/sub2/conv2/Conv2D (603.98m/603.98m flops)
+  unit_1_1/sub1/conv1/Conv2D (603.98m/603.98m flops)
+  unit_1_1/sub2/conv2/Conv2D (603.98m/603.98m flops)
+  ...
+```
+
+8) Show the number of parameters of all `tf.trainable_variables()` in the model.
+
+```shell
+# Requires --graph_path --op_log_path.
+# store option for future commands.
+tfprof> set -account_type_regexes _trainable_variables
+tfprof> scope -max_depth 4 -select params
+_TFProfRoot (--/464.15k params)
+  init/init_conv/DW (3x3x3x16, 432/432 params)
+  pool_logit/DW (64x10, 640/640 params)
+  pool_logit/biases (10, 10/10 params)
+  unit_last/final_bn/beta (64, 64/64 params)
+  unit_last/final_bn/gamma (64, 64/64 params)
+```
+
+Where does “_trainable_variables” come from? It is from the OpLog file
+generated by the write_op_log() Python API. write_op_log() helps users create some
+common op types implicitly. Users can define their own op types and log it
+through the write_op_log() API.
+
+9) What if I’m lazy and don’t want to define op type? I have given my ops
+well-defined names in my model’s code. And want to use names to select a group
+of ops. Let’s try it!
+
+```shell
+tfprof> set -account_type_regexes .*
+tfprof> scope -show_name_regexes unit_2_1.*DW -max_depth 100 -account_displayed_op_only
+_TFProfRoot (0/18.43k params)
+  unit_2_1/sub1/conv1/DW (3x3x32x32, 9.22k/9.22k params)
+  unit_2_1/sub2/conv2/DW (3x3x32x32, 9.22k/9.22k params)
+```
+
+The above command allows you to filter ops that match specific names.
+`-account_displayed_op_only` asks tfprof to only account ops displayed
+in terminal. Otherwise, tfprof accounts all ops matched by
+`-account_type_regexes` recursively even if they are hidden due to some
+options such as -max_depth.
+
+10) TensorFlow has built-in op types. For example, built-in op type `Variable`
+seems to include `Variable's` created by your model. However, be careful when
+depending on it because TensorFlow creates extra `Variable` ops implicitly and
+the implicitly created ops can have the same prefix as the `Variable's` you
+defined.
+
+In the following example, extra `Variables` are created and “/Momentum” is
+appended to their names. This might cause your “model capacity” calculation
+to get wrong.
+
+```shell
+tfprof> scope -account_type_regexes Variable -max_depth 4 -select params
+_TFProfRoot (--/930.58k params)
+  global_step (1/1 params)
+  init/init_conv/DW (3x3x3x16, 432/864 params)
+  pool_logit/DW (64x10, 640/1.28k params)
+    pool_logit/DW/Momentum (64x10, 640/640 params)
+  pool_logit/biases (10, 10/20 params)
+    pool_logit/biases/Momentum (10, 10/10 params)
+  unit_last/final_bn/beta (64, 64/128 params)
+  unit_last/final_bn/gamma (64, 64/128 params)
+  unit_last/final_bn/moving_mean (64, 64/64 params)
+  unit_last/final_bn/moving_variance (64, 64/64 params)
+```
+
+
+11) An example of defining an extra op type for ops using `OpLog`
+
+First, in Python code, create an `OpLog` proto and add op type
+information to it:
+
+```python
+op_log = tfprof_log_pb2.OpLog()
+entry = op_log.log_entries.add()
+entry.name = 'pool_logit/DW'
+entry.types.append('pool_logit')
+entry = op_log.log_entries.add()
+entry.name = 'pool_logit/biases'
+# Alternatively:
+# var = tf.get_variable(xxx)
+# entry.name = var.op.name
+entry.types.append('pool_logit')
+```
+
+Second, call write_op_log to write the OpLog proto.
+
+```python
+tfprof_logger.write_op_log(sess.graph, /tmp/my_op_log_dir, op_log)
+```
+
+Third, when starting the tfprof tool, specify
+"--op_log_path /tmp/my_op_log_dir/op_log"
+
+```shell
+tfprof> scope -account_type_regexes pool_logit -max_depth 4 -select params
+_TFProfRoot (--/650 params)
+  pool_logit/DW (64x10, 640/640 params)
+  pool_logit/biases (10, 10/10 params)
+```
+
+Note that when you call
+`tfprof_logger.write_op_log(...)`, the tool adds all `Variables` inside
+`tf.trainable_variables()` to `_trainable_variables`.
+
+12) Run tfprof in one-shot mode and dump result to file.
+
+```shell
+# Printed to stdout if --dump_to_file is not set.
+tfprof scope --graph_path /cns/ij-d/home/xpan/tfprof/graph.pbtxt  \
+             --max_depth 3 \
+             --dump_to_file "/tmp/dump"
+Reading Files...
+Parsing GraphDef...
+Preparing Views...
+
+cat /tmp/dump
+_TFProfRoot (--/930.58k params)
+  global_step (0/0 params)
+  pool_logit/DW (64x10, 640/1.28k params)
+  pool_logit/biases (10, 10/20 params)
+```
+
+13) Analyze how balanced Variable are on parameter servers.
+
+In this tutorial, I'm going to use a seq2seq model, which is split
+on several gpus at workers and several parameter servers.
+
+In tfprof, 'device' is an op_type. For example, if op1 and op2 are placed on
+gpu0. They share an op_type called 'gpu0'.
+
+```shell
+bazel-bin/tensorflow/contrib/tfprof/tools/tfprof/tfprof \
+  --graph_path ~/tfprof/textsum/graph.pbtxt  \
+  --run_meta_path ~/tfprof/textsum/run_meta
+
+# Looks like ps task 1 is holding twice as many parameters as task 0.
+tfprof> scope -select device,params -account_type_regexes .*ps.*task:0.* -max_depth 1
+_TFProfRoot (--/25.81m params)
+tfprof> scope -select device,params -account_type_regexes .*ps.*task:1.* -max_depth 1
+_TFProfRoot (--/58.84m params)
+```
+
+### CLI Input Files
+
+The tfprof command line interface (CLI) loads dumped files from a tensorflow model
+and converts them into in-memory data structures. To use it, users need to specify
+the locations of the dumped files. The following are the dumped files loaded
+by tfprof:
+
+<b>--graph_path:</b> GraphDef text file (required). Used to build in-memory
+representation of the model. For example, graph.pbtxt written by tf.Supervisor
+is a candidate. If you are not using tf.Supervisor, you can easily get GraphDef
+using tf.Graph.as_graph_def() or other API.
+
+<b>--run_meta_path:</b> tensorflow::RunMetadata.
+Used to get the memory and time consumption of
+each op of the model. Users need to enable it. For example, the following code
+snippet writes a RunMetadata file:
+
+```python
+run_options = config_pb2.RunOptions(trace_level=config_pb2.RunOptions.FULL_TRACE)
+run_metadata = config_pb2.RunMetadata()
+# Once in a while, call it to get the RunMetadata.
+_ = self._sess.run(..., options=run_options, run_metadata=run_metadata)
+with gfile.Open(os.path.join(output_dir, "run_meta"), "w") as f:
+  f.write(run_metadata.SerializeToString())
+```
+
+<b>--op_log_path:</b>
+tensorflow::tfprof::OpLog. A proto used to provide extra op information
+for ops. By giving a group of ops a type name, users can easily aggregate the
+statistics for those ops without accidentally missing or including extra ops.
+tfprof exposes the following Python API to add op information and logging.
+
+```python
+  def write_op_log(graph, log_dir, op_log=None)
+```
+
+<b>--checkpoint_path:</b>
+TensorFlow checkpoint. It defines _checkpoint_variable op type. It also
+provides checkpointed tensors' values.
+
+
+## Design
+
+
+### In-memory representation
+
+<b>Scope:</b> This representation organizes ops based on name scope hierarchy,
+similar to filesystem hierarchy. Hence, it is essentially a tree data structure.
+For example op1 with name “name1/name2” is a child of op2 with name “name1”.
+
+<b>Graph:</b> The representation organizes ops based on op inputs. Hence it is
+a graph structure. The graph is a “directed acyclic graph” (hopefully), with
+direction from “output to input”. The direction is designed this way so that users
+can trace from “result” to its “sources”.
+
+### Command line options
+
+tfprof’s major goals are to measure system performance and quickly analyze
+model architectures. Hence, its commands and options should allow users to achieve
+these 2 goals easily.
+
+<b>graph:</b> It is expected that users will mostly use graph representation to
+debug system performance. Hence, tfprof supports graph command, which pulls the
+graph in-memory representation described above.
+
+<b>scope:</b> It is expected that some users might want to explore their model
+statistics using the name scope information they defined in the Python codes.
+Hence, tfprof supports “scope” command, which pulls the tree in-memory
+representation.
+
+<b>set:</b> It is used to store the options so that user doesn’t need to
+re-type the same option again and again in the follow up command line. Note that
+tfprof has traditional terminal’s history and auto-complete support.
+
+<b>help:</b> print help information.
+
+<b>Options:</b> Run “tfprof help” to get detailed explanations.
+
+```python
+"-max_depth",
+"-min_bytes",
+"-min_micros",
+"-min_params",
+"-min_float_ops",
+"-order_by",
+"-account_type_regexes",
+"-start_name_regexes",
+"-trim_name_regexes",
+"-show_name_regexes",
+"-hide_name_regexes",
+"-account_displayed_op_only",
+"-select",
+"-viz",  # Only supported for graph command.
+"-dump_to_file",
+```
+
+A key design is that stats are aggregated from descendants up to ancestors.
+`-account_type_regexes` is used to decide which ops stat is accounted. It makes
+decision based on op type. Usually set it to `.*` if no extra type information
+is added to the ops using OpLog. Intuitively, only accounted ops are displayed.
+`-min/max` and `-show/hide/trim/start` options are only used to optionally
+display or hide ops based on ops’ name and stats. However, they don’t prevent
+tfprof from accounting stats of hidden ops. Hence, the stat of an op can be
+aggregated by its parent even if it is hidden. `-account_displayed_op_only` is
+an option to break this rule. When it is set, only displayed ops are accounted.
+
+Regexes are all comma-separated, for example `-show_name_regexes`
+`regex1.*,regex2.*`. It is designed this way because it is convenient and comma
+is not expected to show up in op names.
+
+`-order_by` is used to order displayed ops. Displayed ops at the same hierarchy
+(notice the indent printed) are sorted according to order_by.
+
+## Future Work
+
+* Load SummaryWriter event logs so that it can show the latest summary value.
+
+* Better sorting and aggregation of outputs. Easier comprehension.
+
+* Currently, shape information is based on `graph.pbtxt`. When the shape
+information is incomplete, tfprof ignores it. See if it can use `RunMetadata`
+and `Checkpoint` to complete shape information.
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/BUILD b/tensorflow/contrib/tfprof/python/tools/tfprof/BUILD
new file mode 100644
index 00000000000..d78020bbd87
--- /dev/null
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/BUILD
@@ -0,0 +1,31 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+py_library(
+    name = "tfprof_logger",
+    srcs = ["tfprof_logger.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/contrib/tfprof/tools/tfprof:protos_all_py",
+        "//tensorflow/python:framework_for_generated_wrappers",
+    ],
+)
+
+# -----------------------------------------------------------------------------
+# Google-internal targets.  These must be at the end for syncrepo.
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/__init__.py b/tensorflow/contrib/tfprof/python/tools/tfprof/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/tfprof_logger.py b/tensorflow/contrib/tfprof/python/tools/tfprof/tfprof_logger.py
new file mode 100644
index 00000000000..4a487461a38
--- /dev/null
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/tfprof_logger.py
@@ -0,0 +1,114 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Logging tensorflow::tfprof::OpLog.
+
+OpLog is used to add extra model information for offline analysis by tfprof.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import tensorflow as tf
+from tensorflow.contrib.tfprof.python.tools.tfprof import tfprof_log_pb2
+from tensorflow.python.framework import ops
+
+TRAINABLE_VARIABLES = '_trainable_variables'
+REGISTERED_FLOP_STATS = 'flops'
+
+
+def _get_logged_ops(graph):
+  """Extract trainable model parameters and FLOPs for ops from a Graph.
+
+  Args:
+    graph: tf.Graph.
+  Returns:
+    logged_ops: dict mapping from op_name to OpLogEntry.
+  """
+  logged_ops = {}
+
+  graph_def = graph.as_graph_def()
+  for node in graph_def.node:
+    try:
+      stats = ops.get_stats_for_node_def(graph, node, REGISTERED_FLOP_STATS)
+    except ValueError:
+      # Catch the exception raised when the shape is incomplete, and skip the op.
+      stats = None
+
+    if not stats or not stats.value:
+      continue
+    if node.name not in logged_ops:
+      entry = tfprof_log_pb2.OpLogEntry()
+      entry.name = node.name
+      entry.float_ops = stats.value
+      logged_ops[entry.name] = entry
+
+  for v in graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
+    if v.op.name not in logged_ops:
+      entry = tfprof_log_pb2.OpLogEntry()
+      entry.name = v.op.name
+      entry.types.append(TRAINABLE_VARIABLES)
+      logged_ops[entry.name] = entry
+    else:
+      logged_ops[v.op.name].types.append(TRAINABLE_VARIABLES)
+  return logged_ops
+
+
+def _merge_default_with_oplog(graph, op_log=None):
+  """Merge the tfprof default extra info with caller's op_log.
+
+  Args:
+    graph: tf.Graph.
+    op_log: OpLog proto.
+  Returns:
+    tmp_op_log: Merged OpLog proto.
+  """
+  tmp_op_log = tfprof_log_pb2.OpLog()
+  logged_ops = _get_logged_ops(graph)
+  if not op_log:
+    tmp_op_log.log_entries.extend(logged_ops.values())
+  else:
+    all_ops = dict()
+    for entry in op_log.log_entries:
+      all_ops[entry.name] = entry
+    for op_name, entry in logged_ops.iteritems():
+      if op_name in all_ops:
+        all_ops[op_name].types.extend(entry.types)
+        if entry.float_ops > 0 and all_ops[op_name].float_ops == 0:
+          all_ops[op_name].float_ops = entry.float_ops
+      else:
+        all_ops[op_name] = entry
+    tmp_op_log.log_entries.extend(all_ops.values())
+  return tmp_op_log
+
+
+def write_op_log(graph, log_dir, op_log=None):
+  """Log provided 'op_log', and add additional model information below.
+
+    The API also assigns ops in tf.trainable_variables() an op type called
+    '_trainable_variables'.
+    The API also logs 'flops' statistics for ops with op.RegisterStatistics()
+    defined.
+
+  Args:
+    graph: tf.Graph.
+    log_dir: directory to write the log file.
+    op_log: OpLog proto.
+  """
+  op_log = _merge_default_with_oplog(graph, op_log)
+
+  with tf.gfile.Open(os.path.join(log_dir, 'tfprof_log'), 'w') as log:
+    log.write(op_log.SerializeToString())
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/BUILD b/tensorflow/contrib/tfprof/tools/tfprof/BUILD
new file mode 100644
index 00000000000..da161b1ffa1
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/BUILD
@@ -0,0 +1,52 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+# -----------------------------------------------------------------------------
+# Google-internal targets.  These must be at the end for syncrepo.
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+cc_binary(
+    name = "tfprof",
+    srcs = ["tfprof_main.cc"],
+    deps = [
+        ":protos_all_cc",
+        "//tensorflow/c:c_api",
+        "//tensorflow/c:checkpoint_reader",
+        "//tensorflow/contrib/tfprof/tools/tfprof/internal:tfprof_options",
+        "//tensorflow/contrib/tfprof/tools/tfprof/internal:tfprof_stats",
+        "//tensorflow/contrib/tfprof/tools/tfprof/internal:tfprof_utils",
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "@linenoise//:linenoise",
+    ],
+)
+
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
+
+tf_proto_library(
+    name = "protos_all",
+    srcs = glob(
+        ["**/*.proto"],
+    ),
+    cc_api_version = 2,
+    cc_libs = ["//tensorflow/core:protos_all_cc"],
+    go_api_version = 2,
+    java_api_version = 2,
+    visibility = ["//visibility:public"],
+)
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/internal/BUILD b/tensorflow/contrib/tfprof/tools/tfprof/internal/BUILD
new file mode 100644
index 00000000000..42812b345dc
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/internal/BUILD
@@ -0,0 +1,227 @@
+package(
+    default_visibility = ["//tensorflow:__subpackages__"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+cc_library(
+    name = "tfprof_stats",
+    srcs = ["tfprof_stats.cc"],
+    hdrs = ["tfprof_stats.h"],
+    deps = [
+        ":tfprof_graph",
+        ":tfprof_node",
+        ":tfprof_options",
+        ":tfprof_scope",
+        ":tfprof_show",
+        ":tfprof_utils",
+        "//tensorflow/c:checkpoint_reader",
+        "//tensorflow/contrib/tfprof/tools/tfprof:protos_all_cc",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+cc_library(
+    name = "tfprof_node",
+    srcs = ["tfprof_node.cc"],
+    hdrs = ["tfprof_node.h"],
+    deps = [
+        ":tfprof_options",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+cc_library(
+    name = "tfprof_scope",
+    srcs = ["tfprof_scope.cc"],
+    hdrs = ["tfprof_scope.h"],
+    deps = [
+        ":tfprof_constants",
+        ":tfprof_node",
+        ":tfprof_options",
+        ":tfprof_show",
+        ":tfprof_tensor",
+        ":tfprof_utils",
+        "//tensorflow/c:c_api",
+        "//tensorflow/c:checkpoint_reader",
+        "//tensorflow/contrib/tfprof/tools/tfprof:protos_all_cc",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+cc_library(
+    name = "tfprof_graph",
+    srcs = ["tfprof_graph.cc"],
+    hdrs = ["tfprof_graph.h"],
+    deps = [
+        ":tfprof_constants",
+        ":tfprof_node",
+        ":tfprof_options",
+        ":tfprof_show",
+        ":tfprof_tensor",
+        ":tfprof_utils",
+        "//tensorflow/c:checkpoint_reader",
+        "//tensorflow/contrib/tfprof/tools/tfprof:protos_all_cc",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+cc_library(
+    name = "tfprof_show",
+    srcs = ["tfprof_show.cc"],
+    hdrs = ["tfprof_show.h"],
+    deps = [
+        ":tfprof_constants",
+        ":tfprof_node",
+        ":tfprof_options",
+        ":tfprof_tensor",
+        ":tfprof_utils",
+        "//tensorflow/c:checkpoint_reader",
+        "//tensorflow/contrib/tfprof/tools/tfprof:protos_all_cc",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_cc_test(
+    name = "tfprof_show_test",
+    srcs = ["tfprof_show_test.cc"],
+    data = [
+        "testdata/ckpt",
+        "testdata/graph.pbtxt",
+        "testdata/run_meta",
+        "testdata/tfprof_log",
+    ],
+    deps = [
+        ":tfprof_constants",
+        ":tfprof_options",
+        ":tfprof_stats",
+        ":tfprof_utils",
+        "//tensorflow/c:checkpoint_reader",
+        "//tensorflow/contrib/tfprof/tools/tfprof:protos_all_cc",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
+cc_library(
+    name = "tfprof_utils",
+    srcs = ["tfprof_utils.cc"],
+    hdrs = ["tfprof_utils.h"],
+    deps = [
+        ":tfprof_options",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+cc_library(
+    name = "tfprof_options",
+    srcs = ["tfprof_options.cc"],
+    hdrs = ["tfprof_options.h"],
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
+    name = "print_model_analysis",
+    srcs = ["print_model_analysis.cc"],
+    hdrs = ["print_model_analysis.h"],
+    deps = [
+        ":tfprof_options",
+        ":tfprof_stats",
+        "//tensorflow/c:checkpoint_reader",
+        "//tensorflow/contrib/tfprof/tools/tfprof:protos_all_cc",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_cc_test(
+    name = "tfprof_stats_test",
+    srcs = ["tfprof_stats_test.cc"],
+    data = [
+        "testdata/ckpt",
+        "testdata/graph.pbtxt",
+        "testdata/run_meta",
+        "testdata/tfprof_log",
+    ],
+    deps = [
+        ":tfprof_constants",
+        ":tfprof_options",
+        ":tfprof_stats",
+        ":tfprof_utils",
+        "//tensorflow/c:checkpoint_reader",
+        "//tensorflow/contrib/tfprof/tools/tfprof:protos_all_cc",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
+cc_library(
+    name = "tfprof_tensor",
+    srcs = ["tfprof_tensor.cc"],
+    hdrs = ["tfprof_tensor.h"],
+    deps = [
+        "//tensorflow/contrib/tfprof/tools/tfprof:protos_all_cc",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "tfprof_tensor_test",
+    srcs = ["tfprof_tensor_test.cc"],
+    data = [
+        "testdata/ckpt",
+        "testdata/graph.pbtxt",
+    ],
+    deps = [
+        ":tfprof_options",
+        ":tfprof_stats",
+        ":tfprof_utils",
+        "//tensorflow/c:checkpoint_reader",
+        "//tensorflow/contrib/tfprof/tools/tfprof:protos_all_cc",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
+cc_library(
+    name = "tfprof_constants",
+    hdrs = ["tfprof_constants.h"],
+    deps = [
+    ],
+)
+# -----------------------------------------------------------------------------
+# Google-internal targets.  These must be at the end for syncrepo.
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/internal/print_model_analysis.cc b/tensorflow/contrib/tfprof/tools/tfprof/internal/print_model_analysis.cc
new file mode 100644
index 00000000000..ab1e47b32dd
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/internal/print_model_analysis.cc
@@ -0,0 +1,65 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/print_model_analysis.h"
+
+#include <stdio.h>
+#include <memory>
+#include <utility>
+
+#include "tensorflow/c/checkpoint_reader.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_stats.h"
+
+namespace tensorflow {
+namespace tfprof {
+string PrintModelAnalysis(const string* graph, const string* run_meta,
+                          const string* op_log, const string* command,
+                          const Options* options) {
+  CHECK(graph) << "graph mustn't be null";
+  CHECK(command) << "command mustn't be null";
+  CHECK(options) << "options mustn't be null";
+  std::unique_ptr<GraphDef> graph_ptr(new GraphDef());
+  graph_ptr->ParseFromString(*graph);
+
+  std::unique_ptr<RunMetadata> run_meta_ptr;
+  if (run_meta) {
+    run_meta_ptr.reset(new RunMetadata());
+    run_meta_ptr->ParseFromString(*run_meta);
+  }
+
+  std::unique_ptr<OpLog> op_log_ptr;
+  if (op_log) {
+    op_log_ptr.reset(new OpLog());
+    op_log_ptr->ParseFromString(*op_log);
+  }
+
+  std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader;
+
+  TFStats tf_stats(std::move(graph_ptr), std::move(run_meta_ptr),
+                   std::move(op_log_ptr), std::move(ckpt_reader));
+
+  if (options->dump_to_file.empty()) {
+    printf("\n=========================Options=============================\n");
+    printf("%s", options->ToString().c_str());
+    printf("\n==================Model Analysis Report======================\n");
+    TFProfNode root(tf_stats.PrintGraph(*command, *options));
+    printf("\n======================End of Report==========================\n");
+    fflush(stdout);
+    return root.SerializeAsString();
+  }
+  return tf_stats.PrintGraph(*command, *options).SerializeAsString();
+}
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/internal/print_model_analysis.h b/tensorflow/contrib/tfprof/tools/tfprof/internal/print_model_analysis.h
new file mode 100644
index 00000000000..579147f1641
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/internal/print_model_analysis.h
@@ -0,0 +1,45 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_PRINT_MODEL_ANALYSIS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_PRINT_MODEL_ANALYSIS_H_
+
+#include <string>
+
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_options.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/tfprof_log.pb.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/tfprof_output.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+namespace tfprof {
+
+// ***This API is only for swig.***
+//
+// Interface defined for Python API swig. Calls the tfprof core API.
+// 'graph', 'run_meta', 'op_log' are serialized GraphDef, RunMetadata,
+// OpLog strings, respectively.
+// 'graph', 'command' and 'options' are required. Others can be nullptr
+// if not available.
+string PrintModelAnalysis(const string* graph, const string* run_meta,
+                          const string* op_log, const string* command,
+                          const Options* options);
+
+}  // namespace tfprof
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_PRINT_MODEL_ANALYSIS_H_
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/internal/testdata/ckpt b/tensorflow/contrib/tfprof/tools/tfprof/internal/testdata/ckpt
new file mode 100644
index 00000000000..2f59f071c59
Binary files /dev/null and b/tensorflow/contrib/tfprof/tools/tfprof/internal/testdata/ckpt differ
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/internal/testdata/graph.pbtxt b/tensorflow/contrib/tfprof/tools/tfprof/internal/testdata/graph.pbtxt
new file mode 100644
index 00000000000..fd54551776c
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/internal/testdata/graph.pbtxt
@@ -0,0 +1,636 @@
+node {
+  name: "zeros"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 6
+          }
+          dim {
+            size: 6
+          }
+          dim {
+            size: 3
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "DW"
+  op: "Variable"
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 3
+        }
+        dim {
+          size: 3
+        }
+        dim {
+          size: 3
+        }
+        dim {
+          size: 6
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "DW/Initializer/random_normal/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@DW"
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "\003\000\000\000\003\000\000\000\003\000\000\000\006\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "DW/Initializer/random_normal/mean"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@DW"
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "DW/Initializer/random_normal/stddev"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@DW"
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0010000000475
+      }
+    }
+  }
+}
+node {
+  name: "DW/Initializer/random_normal/RandomStandardNormal"
+  op: "RandomStandardNormal"
+  input: "DW/Initializer/random_normal/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@DW"
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 87654321
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 5
+    }
+  }
+}
+node {
+  name: "DW/Initializer/random_normal/mul"
+  op: "Mul"
+  input: "DW/Initializer/random_normal/RandomStandardNormal"
+  input: "DW/Initializer/random_normal/stddev"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@DW"
+      }
+    }
+  }
+}
+node {
+  name: "DW/Initializer/random_normal"
+  op: "Add"
+  input: "DW/Initializer/random_normal/mul"
+  input: "DW/Initializer/random_normal/mean"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@DW"
+      }
+    }
+  }
+}
+node {
+  name: "DW/Assign"
+  op: "Assign"
+  input: "DW"
+  input: "DW/Initializer/random_normal"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@DW"
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "DW/read"
+  op: "Identity"
+  input: "DW"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@DW"
+      }
+    }
+  }
+}
+node {
+  name: "Conv2D"
+  op: "Conv2D"
+  input: "zeros"
+  input: "DW/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 2
+        i: 2
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "DW2"
+  op: "Variable"
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 2
+        }
+        dim {
+          size: 2
+        }
+        dim {
+          size: 6
+        }
+        dim {
+          size: 12
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "DW2/Initializer/random_normal/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@DW2"
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "\002\000\000\000\002\000\000\000\006\000\000\000\014\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "DW2/Initializer/random_normal/mean"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@DW2"
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "DW2/Initializer/random_normal/stddev"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@DW2"
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0010000000475
+      }
+    }
+  }
+}
+node {
+  name: "DW2/Initializer/random_normal/RandomStandardNormal"
+  op: "RandomStandardNormal"
+  input: "DW2/Initializer/random_normal/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@DW2"
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 87654321
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 15
+    }
+  }
+}
+node {
+  name: "DW2/Initializer/random_normal/mul"
+  op: "Mul"
+  input: "DW2/Initializer/random_normal/RandomStandardNormal"
+  input: "DW2/Initializer/random_normal/stddev"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@DW2"
+      }
+    }
+  }
+}
+node {
+  name: "DW2/Initializer/random_normal"
+  op: "Add"
+  input: "DW2/Initializer/random_normal/mul"
+  input: "DW2/Initializer/random_normal/mean"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@DW2"
+      }
+    }
+  }
+}
+node {
+  name: "DW2/Assign"
+  op: "Assign"
+  input: "DW2"
+  input: "DW2/Initializer/random_normal"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@DW2"
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "DW2/read"
+  op: "Identity"
+  input: "DW2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@DW2"
+      }
+    }
+  }
+}
+node {
+  name: "Conv2D_1"
+  op: "Conv2D"
+  input: "Conv2D"
+  input: "DW2/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 2
+        i: 2
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+versions {
+  producer: 13
+}
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/internal/testdata/run_meta b/tensorflow/contrib/tfprof/tools/tfprof/internal/testdata/run_meta
new file mode 100644
index 00000000000..2d5bb7ddaff
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/internal/testdata/run_meta
@@ -0,0 +1,22 @@
+
+�
+�
+%/job:localhost/replica:0/task:0/cpu:0:
+_SOURCE������� (2
+cpuB_SOURCE = NoOp()H塈����a
+zeros������� (2
+cpu:(&"�cpu0������Bzeros = Const()H�������^
+DW������� (2
+cpu:(&"�cpu0ੀ���BDW = Variable()H�������`
+DW2������� (2
+cpu:(&"�	cpu0������BDW2 = Variable()H�������j
+DW/read������� (2
+cpu:(&"�cpu0ੀ���BDW/read = Identity(DW)H�������m
+DW2/read������� (2
+cpu:(&"�	cpu0������BDW2/read = Identity(DW2)H�������s
+Conv2D������� P(U2
+cpu�:(&"�cpu0ી���BConv2D = Conv2D(zeros, DW/read)H�������{
+Conv2D_1������� (2
+cpu�:(&"�cpu0฀���B#Conv2D_1 = Conv2D(Conv2D, DW2/read)H�������6
+_SINK������� (2
+cpuB_SINK = NoOp()H�������
\ No newline at end of file
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/internal/testdata/tfprof_log b/tensorflow/contrib/tfprof/tools/tfprof/internal/testdata/tfprof_log
new file mode 100644
index 00000000000..c35d4338e97
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/internal/testdata/tfprof_log
@@ -0,0 +1,9 @@
+
+
+Conv2D_1�$
+
+DW2_trainable_variables
+
+DW_trainable_variables
+
+Conv2D�-
\ No newline at end of file
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_constants.h b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_constants.h
new file mode 100644
index 00000000000..169ebae4a75
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_constants.h
@@ -0,0 +1,37 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_TFPROF_CONSTANTS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_TFPROF_CONSTANTS_H_
+
+namespace tensorflow {
+namespace tfprof {
+
+// Op name of root of everything. Aggregates all stats.
+static const char* const kTFProfRoot = "_TFProfRoot";
+// Op type for nodes that doesn't represent a physical node in the
+// TensorFlow model. Only exist as a placehold to aggregate children.
+// For example, kTFProfRoot belongs to this type.
+static const char* const kTFGraphParent = "_TFGraphParent";
+static const char* const kTFScopeParent = "_kTFScopeParent";
+// Op type for tf.trainable_variables().
+static const char* const kTrainableVarType = "_trainable_variables";
+// Op type for tensors in the checkpoint file.
+static const char* const kCkptVarType = "_checkpoint_variables";
+
+}  // namespace tfprof
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_TFPROF_CONSTANTS_H_
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_graph.cc b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_graph.cc
new file mode 100644
index 00000000000..287fd78d46c
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_graph.cc
@@ -0,0 +1,222 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_graph.h"
+
+#include <stdio.h>
+#include <utility>
+
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_constants.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_tensor.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/regexp.h"
+
+namespace tensorflow {
+namespace tfprof {
+GraphNode* TFGraph::CreateParentNode(const string& name) {
+  node_defs_.push_back(std::unique_ptr<NodeDef>(new NodeDef()));
+  node_defs_.back()->set_name(name);
+  node_defs_.back()->set_op(kTFGraphParent);
+  parent_nodes_[name] =
+      std::unique_ptr<TFNode>(new TFNode(node_defs_.back().get()));
+  nodes_map_[name] =
+      std::unique_ptr<GraphNode>(new GraphNode(parent_nodes_[name].get()));
+  return nodes_map_[name].get();
+}
+
+void TFGraph::AddNode(TFNode* node) {
+  string name = node->node_def()->name();
+  nodes_map_[name] = std::unique_ptr<GraphNode>(new GraphNode(node));
+}
+
+void TFGraph::Build() {
+  if (!roots_.empty()) return;
+
+  std::set<string> nonroots;
+  // Filter out the root nodes (node not input of any other node).
+  for (auto it = nodes_map_.begin(); it != nodes_map_.end(); it++) {
+    GraphNode* node = it->second.get();
+    const std::map<string, TFNode*>& inputs = node->node->inputs();
+    for (auto inputs_it = inputs.cbegin(); inputs_it != inputs.cend();
+         inputs_it++) {
+      nonroots.insert(inputs_it->first);
+      auto child_it = nodes_map_.find(inputs_it->first);
+      if (child_it != nodes_map_.end()) {
+        node->children.push_back(child_it->second.get());
+      }
+    }
+  }
+  for (auto it = nodes_map_.begin(); it != nodes_map_.end(); it++) {
+    if (nonroots.find(it->first) == nonroots.end()) {
+      roots_.push_back(it->second.get());
+    }
+  }
+}
+
+const ShowNode* TFGraph::ShowInternal(const Options& opts) {
+  // Search the nodes to start from.
+  std::vector<GraphNode*> roots = roots_;
+  if (opts.start_name_regexes.size() != 1 ||
+      opts.start_name_regexes[0] != ".*") {
+    std::set<string> visited;
+    roots = SearchRoot(roots, opts.start_name_regexes, &visited);
+  }
+
+  GraphNode* root = CreateParentNode(kTFProfRoot);
+  root->children.assign(roots.begin(), roots.end());
+
+  std::map<string, int64> account_visits;
+  Account({root}, opts, &account_visits);
+
+  if (opts.viz) {
+    printf("Visualizing feature disabled...\n");
+  }
+  std::set<string> visits;
+  return PrintGraph({root}, opts, 1, 0, 0, &visits)[0];
+}
+
+std::vector<GraphNode*> TFGraph::SearchRoot(
+    const std::vector<GraphNode*>& roots, const std::vector<string>& regexes,
+    std::set<string>* visited) {
+  std::vector<GraphNode*> res;
+  if (roots.empty()) {
+    return res;
+  }
+  for (GraphNode* root : roots) {
+    if (visited->find(root->name()) != visited->end()) continue;
+    visited->insert(root->name());
+    // If the parent is a start point, don't search its children.
+    // Note that its children can still be added as start node through
+    // another route.
+    bool match_start_node = false;
+    for (const string& regex : regexes) {
+      if (RE2::FullMatch(root->name(), regex)) {
+        res.push_back(root);
+        match_start_node = true;
+        break;
+      }
+    }
+    if (match_start_node) {
+      continue;
+    }
+    std::vector<GraphNode*> nroot =
+        SearchRoot(root->children, regexes, visited);
+    res.insert(res.end(), nroot.begin(), nroot.end());
+  }
+  return res;
+}
+
+std::vector<GraphNode*> TFGraph::PrintGraph(const std::vector<GraphNode*> roots,
+                                            const Options& opts, int depth,
+                                            int hidden, int last_ident,
+                                            std::set<string>* visits) {
+  std::vector<GraphNode*> show_nodes;
+
+  for (GraphNode* node : roots) {
+    if (visits->find(node->name()) != visits->end()) continue;
+    visits->insert(node->name());
+
+    int nhidden = hidden;
+    int nlast_ident = last_ident;
+    bool show = ShouldShow(node, opts, depth);
+    if (show) {
+      node->formatted_str.clear();
+      if (opts.account_displayed_op_only) {
+        node->ResetTotalStats();
+        node->AddSelfToTotalStats();
+      }
+      nhidden = 0;
+      nlast_ident = (hidden && opts.select.find(kShown[4]) != opts.select.end()
+                         ? last_ident + 4
+                         : last_ident + 2);
+    } else {
+      ++nhidden;
+    }
+
+    std::vector<GraphNode*> show_cnodes;
+    if (!ShouldTrim(node, opts.trim_name_regexes)) {
+      show_cnodes = PrintGraph(node->children, opts, depth + 1, nhidden,
+                               nlast_ident, visits);
+    }
+    if (show) {
+      show_cnodes = SortNodes(show_cnodes, opts);
+      string children_str;
+      for (GraphNode* sc : show_cnodes) {
+        children_str += sc->formatted_str;
+        node->mutable_proto()->add_children()->MergeFrom(sc->proto());
+        if (opts.account_displayed_op_only) {
+          node->AggregateTotalStats(sc);
+        }
+      }
+      if (hidden && opts.select.find(kShown[4]) != opts.select.end()) {
+        node->formatted_str = strings::Printf(
+            "%s...hidden %d...\n", string(last_ident, ' ').c_str(), hidden);
+        node->formatted_str +=
+            strings::Printf("  %s%s\n", string(last_ident, ' ').c_str(),
+                            node->Format(opts).c_str());
+      } else {
+        node->formatted_str =
+            strings::Printf("%s%s\n", string(last_ident, ' ').c_str(),
+                            node->Format(opts).c_str());
+      }
+      if (opts.select.find(kShown[5]) != opts.select.end()) {
+        std::unique_ptr<TFProfTensor> tfprof_tensor;
+        if (LookUpCheckPoint(node->name(), &tfprof_tensor)) {
+          string value_str;
+          tfprof_tensor->Display(&value_str,
+                                 node->mutable_proto()->mutable_tensor_value());
+          node->formatted_str += value_str;
+        }
+      }
+
+      node->formatted_str += children_str;
+      show_nodes.push_back(node);
+    } else {
+      show_nodes.insert(show_nodes.end(), show_cnodes.begin(),
+                        show_cnodes.end());
+    }
+  }
+  return show_nodes;
+}
+
+void TFGraph::Account(const std::vector<GraphNode*>& roots, const Options& opts,
+                      std::map<string, int64>* visits) {
+  if (roots.empty()) return;
+
+  for (GraphNode* node : roots) {
+    if (visits->find(node->name()) != visits->end()) continue;
+    (*visits)[node->name()] = 1;
+    node->ResetTotalStats();
+    // Depth-firsth.
+    Account(node->children, opts, visits);
+
+    node->account = ShouldAccount(node, opts);
+    if (node->account) {
+      node->AddSelfToTotalStats();
+    }
+    // Aggregate its children stats.
+    for (GraphNode* c : node->children) {
+      // A node can be visited from multiple parents. Only account once.
+      // "visits==1" is when the node is visited through depth-first search.
+      (*visits)[c->name()] += 1;
+      if ((*visits)[c->name()] > 2) continue;
+
+      node->AggregateTotalStats(c);
+    }
+  }
+}
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_graph.h b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_graph.h
new file mode 100644
index 00000000000..ee54534f56b
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_graph.h
@@ -0,0 +1,116 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Build a graph structure based on op inputs/outputs. The graph is a directed
+// acyclic graph pointing *from outputs to inputs*.
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_TFPROF_GRAPH_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_TFPROF_GRAPH_H_
+
+#include <deque>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "tensorflow/c/checkpoint_reader.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_node.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_options.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_show.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_utils.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/tfprof_output.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+namespace tfprof {
+class GraphNode : public ShowNode {
+ public:
+  explicit GraphNode(TFNode* node) : ShowNode(node) {
+    mutable_proto()->set_inputs(node->inputs().size());
+    mutable_proto()->set_total_inputs(0);
+  }
+
+  void AggregateTotalStats(GraphNode* node) {
+    ShowNode::AggregateTotalStats(node);
+    mutable_proto()->set_total_inputs(proto().total_inputs() +
+                                      node->proto().total_inputs() + 1);
+  }
+
+  void AddSelfToTotalStats() {
+    ShowNode::AddSelfToTotalStats();
+    mutable_proto()->set_total_inputs(proto().total_inputs() +
+                                      proto().inputs());
+  }
+
+  void ResetTotalStats() {
+    ShowNode::ResetTotalStats();
+    mutable_proto()->set_total_inputs(0);
+  }
+
+  std::vector<GraphNode*> children;
+};
+
+// Organize tensorflow ops in a graph structure, pointing from output ops
+// to input ops.
+class TFGraph : public TFShow {
+ public:
+  explicit TFGraph(checkpoint::CheckpointReader* ckpt_reader)
+      : TFShow(ckpt_reader) {}
+  ~TFGraph() override {}
+
+  void AddNode(TFNode* node) override;
+
+  void Build() override;
+
+ private:
+  const ShowNode* ShowInternal(const Options& opts) override;
+
+  bool ShouldShowIfExtra(ShowNode* node, const Options& opts,
+                         int depth) override {
+    return true;
+  }
+
+  GraphNode* CreateParentNode(const string& name);
+
+  std::vector<GraphNode*> SearchRoot(const std::vector<GraphNode*>& roots,
+                                     const std::vector<string>& regexes,
+                                     std::set<string>* visited);
+
+  std::vector<GraphNode*> PrintGraph(const std::vector<GraphNode*> roots,
+                                     const Options& opts, int depth, int hidden,
+                                     int last_ident, std::set<string>* visits);
+
+  void VisualizeGraph(GraphNode* root, const Options& opts);
+
+  std::vector<GraphNode*> GenerateGraphDot(
+      GraphNode* root, GraphNode* last_shown, const Options& opts, int depth,
+      int hidden, std::set<string>* declared_nodes,
+      std::set<string>* declared_edges, TFProfNode* parent);
+
+  void Account(const std::vector<GraphNode*>& roots, const Options& opts,
+               std::map<string, int64>* visits);
+
+  std::vector<GraphNode*> roots_;
+  std::vector<std::unique_ptr<NodeDef>> node_defs_;
+  std::map<string, std::unique_ptr<TFNode>> parent_nodes_;
+  std::map<string, std::unique_ptr<GraphNode>> nodes_map_;
+};
+
+}  // namespace tfprof
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_TFPROF_GRAPH_H_
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_node.cc b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_node.cc
new file mode 100644
index 00000000000..0e8ab366cbb
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_node.cc
@@ -0,0 +1,47 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_node.h"
+
+#include "tensorflow/core/framework/allocation_description.pb.h"
+#include "tensorflow/core/framework/tensor_description.pb.h"
+
+namespace tensorflow {
+namespace tfprof {
+// Attaches runtime execution stats for this node, overriding the GraphDef
+// device when the stats carry a non-empty one.
+void TFNode::AddStepStat(const string& device, const NodeExecStats* step_stat) {
+  if (!device.empty()) {
+    // This might override device from GraphDef.
+    device_ = device;
+  }
+  step_stat_ = step_stat;
+
+  op_start_micros_ = step_stat_->all_start_micros();
+  // NOTE(review): this tests the *values* for non-zero, not field presence;
+  // a legitimate op_start_rel_micros of 0 leaves op_exec_micros_ at 0 --
+  // confirm this is intended.
+  if (step_stat_->op_end_rel_micros() && step_stat_->op_start_rel_micros()) {
+    op_exec_micros_ =
+        step_stat_->op_end_rel_micros() - step_stat_->op_start_rel_micros();
+  }
+  all_spent_micros_ = step_stat_->all_end_rel_micros();
+
+  // Accumulate requested bytes over every output that reports an allocation.
+  for (const auto& output : step_stat_->output()) {
+    if (output.has_tensor_description() &&
+        output.tensor_description().has_allocation_description()) {
+      requested_bytes_ += output.tensor_description()
+                              .allocation_description()
+                              .requested_bytes();
+    }
+  }
+}
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_node.h b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_node.h
new file mode 100644
index 00000000000..c8a8f5e7ec4
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_node.h
@@ -0,0 +1,106 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_TFPROF_NODE_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_TFPROF_NODE_H_
+
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_options.h"
+#include "tensorflow/core/framework/allocation_description.pb.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/framework/tensor_description.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+namespace tfprof {
+
+// In-memory representation of one graph node: its static NodeDef (shape,
+// op, device) plus runtime stats attached later via AddStepStat().
+class TFNode {
+ public:
+  TFNode(const NodeDef* node)
+      : node_(node),
+        step_stat_(nullptr),
+        op_start_micros_(0),
+        op_exec_micros_(0),
+        all_spent_micros_(0),
+        requested_bytes_(0),
+        float_ops_(0) {
+    if (!node) return;
+
+    for (const auto& attr : node->attr()) {
+      // TODO(xpan): Also consider _output_shapes.
+      if (attr.first != "shape" || !attr.second.has_shape()) continue;
+      if (!shape_.empty()) {
+        fprintf(stderr, "Found duplicated shapes!\n");
+        continue;
+      }
+      std::vector<int64> shape_vec;
+      for (const auto& d : attr.second.shape().dim()) {
+        shape_vec.push_back(d.size());
+      }
+      update_shape(shape_vec);
+    }
+    op_types_.insert(node->op());
+    device_ = node->device();
+  }
+
+  TFNode() : TFNode(nullptr) {}
+
+  void AddInput(TFNode* input) { inputs_[input->node_def()->name()] = input; }
+
+  void AddOpType(const string& op_type) { op_types_.insert(op_type); }
+
+  // Defined in tfprof_node.cc.
+  void AddStepStat(const string& device, const NodeExecStats* step_stat);
+
+  void AddFloatOps(int64 float_ops) { float_ops_ = float_ops; }
+
+  const NodeDef* node_def() { return node_; }
+  const std::map<string, TFNode*>& inputs() { return inputs_; }
+  int64 op_start_micros() { return op_start_micros_; }
+  int64 op_exec_micros() { return op_exec_micros_; }
+  int64 all_spent_micros() { return all_spent_micros_; }
+  // Correctly spelled accessor; prefer this in new code.
+  int64 requested_bytes() { return requested_bytes_; }
+  // Deprecated misspelled alias, kept so existing callers keep compiling.
+  int64 requested_byptes() { return requested_bytes_; }
+  int64 float_ops() { return float_ops_; }
+  string device() { return device_; }
+  const std::set<string>& op_types() { return op_types_; }
+
+  const std::vector<int64>& shape() { return shape_; }
+  void update_shape(const std::vector<int64>& shape) { shape_ = shape; }
+
+ private:
+  // Not owned; inputs are keyed by their NodeDef name.
+  std::map<string, TFNode*> inputs_;
+  const NodeDef* node_;
+  const NodeExecStats* step_stat_;
+
+  std::vector<int64> shape_;
+  std::set<string> op_types_;
+  string device_;
+  int64 op_start_micros_;
+  int64 op_exec_micros_;
+  int64 all_spent_micros_;
+  int64 requested_bytes_;
+  int64 float_ops_;
+};
+
+}  // namespace tfprof
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_TFPROF_NODE_H_
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_options.cc b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_options.cc
new file mode 100644
index 00000000000..2574415fdd4
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_options.cc
@@ -0,0 +1,57 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_options.h"
+
+#include "tensorflow/core/lib/strings/stringprintf.h"
+
+namespace tensorflow {
+namespace tfprof {
+
+// Renders all option values as an aligned "flag value" listing.
+// The argument list below must stay in sync (count and order) with the
+// kOptions table in tfprof_options.h.
+string Options::ToString() const {
+  const string s = strings::Printf(
+      "%-28s%d\n"
+      "%-28s%lld\n"
+      "%-28s%lld\n"
+      "%-28s%lld\n"
+      "%-28s%lld\n"
+      "%-28s%s\n"
+      "%-28s%s\n"
+      "%-28s%s\n"
+      "%-28s%s\n"
+      "%-28s%s\n"
+      "%-28s%s\n"
+      "%-28s%s\n"
+      "%-28s%s\n"
+      "%-28s%s\n"
+      "%-28s%s\n"
+      "%-28s%s\n",
+      kOptions[0], max_depth, kOptions[1], min_bytes, kOptions[2], min_micros,
+      kOptions[3], min_params, kOptions[4], min_float_ops, kOptions[5],
+      str_util::Join(device_regexes, ",").c_str(), kOptions[6],
+      order_by.c_str(), kOptions[7],
+      str_util::Join(account_type_regexes, ",").c_str(), kOptions[8],
+      str_util::Join(start_name_regexes, ",").c_str(), kOptions[9],
+      str_util::Join(trim_name_regexes, ",").c_str(), kOptions[10],
+      str_util::Join(show_name_regexes, ",").c_str(), kOptions[11],
+      str_util::Join(hide_name_regexes, ",").c_str(), kOptions[12],
+      (account_displayed_op_only ? "true" : "false"), kOptions[13],
+      str_util::Join(select, ",").c_str(), kOptions[14],
+      (viz ? "true" : "false"), kOptions[15], dump_to_file.c_str());
+  return s;
+}
+
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_options.h b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_options.h
new file mode 100644
index 00000000000..a0c52e6d1af
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_options.h
@@ -0,0 +1,119 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_TFPROF_OPTIONS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_TFPROF_OPTIONS_H_
+
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+
+namespace tensorflow {
+namespace tfprof {
+// NOTE(review): these are `static` definitions in a header, so every
+// translation unit gets its own copy of the arrays -- harmless for the
+// string contents, but do not compare element addresses across TUs.
+// Flag names; array indices pair each flag with its value elsewhere
+// (e.g. Options::ToString).
+static const char* const kOptions[] = {
+    "-max_depth",
+    "-min_bytes",
+    "-min_micros",
+    "-min_params",
+    "-min_float_ops",
+    "-device_regexes",
+    "-order_by",
+    "-account_type_regexes",
+    "-start_name_regexes",
+    "-trim_name_regexes",
+    "-show_name_regexes",
+    "-hide_name_regexes",
+    "-account_displayed_op_only",
+    "-select",
+    "-viz",
+    "-dump_to_file",
+};
+
+// Valid values for the -order_by flag.
+static const char* const kOrderBy[] = {
+    "name", "bytes", "micros", "params", "float_ops",
+};
+
+// Valid values for -select; code indexes into this table (e.g. kShown[0]
+// is "bytes"), so entries are Append Only.
+static const char* const kShown[] = {
+    "bytes",          "micros",       "params", "float_ops",
+    "num_hidden_ops", "tensor_value", "device", "op_types",
+};
+
+// Interactive commands understood by the tool.
+static const char* const kCmds[] = {
+    "scope", "graph", "set", "help",
+};
+
+// All user-configurable filter/display settings for one tfprof query.
+// Field order mirrors the kOptions flag table above.
+struct Options {
+ public:
+  virtual ~Options() {}
+  Options(int max_depth, tensorflow::int64 min_bytes,
+          tensorflow::int64 min_micros, tensorflow::int64 min_params,
+          tensorflow::int64 min_float_ops,
+          const std::vector<string>& device_regexes, const string& order_by,
+          const std::vector<string>& account_type_regexes,
+          const std::vector<string>& start_name_regexes,
+          const std::vector<string>& trim_name_regexes,
+          const std::vector<string>& show_name_regexes,
+          const std::vector<string>& hide_name_regexes,
+          bool account_displayed_op_only, const std::vector<string>& select,
+          bool viz, const string& dump_to_file = "")
+      : max_depth(max_depth),
+        min_bytes(min_bytes),
+        min_micros(min_micros),
+        min_params(min_params),
+        min_float_ops(min_float_ops),
+        device_regexes(device_regexes),
+        order_by(order_by),
+        account_type_regexes(account_type_regexes),
+        start_name_regexes(start_name_regexes),
+        trim_name_regexes(trim_name_regexes),
+        show_name_regexes(show_name_regexes),
+        hide_name_regexes(hide_name_regexes),
+        account_displayed_op_only(account_displayed_op_only),
+        select(select.begin(), select.end()),
+        viz(viz),
+        dump_to_file(dump_to_file) {}
+
+  string ToString() const;
+
+  // Maximum display depth; deeper nodes are not shown.
+  int max_depth;
+  // Minimum thresholds a node must meet to be shown.
+  tensorflow::int64 min_bytes;
+  tensorflow::int64 min_micros;
+  tensorflow::int64 min_params;
+  tensorflow::int64 min_float_ops;
+  std::vector<string> device_regexes;
+  string order_by;
+
+  std::vector<string> account_type_regexes;
+  std::vector<string> start_name_regexes;
+  std::vector<string> trim_name_regexes;
+  std::vector<string> show_name_regexes;
+  std::vector<string> hide_name_regexes;
+  // When true, totals are recomputed from displayed children only.
+  bool account_displayed_op_only;
+
+  // Columns to display; valid entries come from kShown.
+  std::set<string> select;
+  bool viz;
+  // When non-empty, output is written here instead of stdout.
+  string dump_to_file;
+};
+
+}  // namespace tfprof
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_TFPROF_OPTIONS_H_
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_scope.cc b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_scope.cc
new file mode 100644
index 00000000000..6b2bc298ccb
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_scope.cc
@@ -0,0 +1,191 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_scope.h"
+
+#include <stdio.h>
+#include <utility>
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_constants.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_tensor.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/regexp.h"
+
+namespace tensorflow {
+namespace tfprof {
+// Returns the ScopeNode registered under 'name', creating a synthetic
+// kTFScopeParent placeholder (and its backing NodeDef/TFNode) on first use.
+ScopeNode* TFScope::CreateParentNode(const string& name) {
+  auto it = nodes_map_.find(name);
+  if (it != nodes_map_.end()) {
+    return it->second.get();
+  }
+  std::unique_ptr<NodeDef> def(new NodeDef());
+  def->set_name(name);
+  def->set_op(kTFScopeParent);
+  node_defs_.push_back(std::move(def));
+  std::unique_ptr<TFNode>& tf_node = parent_nodes_[name];
+  tf_node.reset(new TFNode(node_defs_.back().get()));
+  std::unique_ptr<ScopeNode>& scope_node = nodes_map_[name];
+  scope_node.reset(new ScopeNode(tf_node.get()));
+  return scope_node.get();
+}
+
+// Registers 'node' and ensures every ancestor scope in its slash-separated
+// name ("a/b/c" -> "a/b" -> "a") has at least a placeholder entry.
+void TFScope::AddNode(TFNode* node) {
+  string name = node->node_def()->name();
+  if (nodes_map_.count(name) == 0) {
+    nodes_map_[name].reset(new ScopeNode(node));
+  }
+
+  for (auto pos = name.find_last_of("/"); pos != name.npos;
+       pos = name.find_last_of("/")) {
+    name = name.substr(0, pos);
+    if (nodes_map_.count(name) == 0) {
+      CHECK(CreateParentNode(name));
+    }
+  }
+}
+
+// Links every node to its parent scope and collects the roots.
+// Idempotent: returns immediately once roots_ is populated.
+void TFScope::Build() {
+  if (!roots_.empty()) return;
+  // Find the roots, which are nodes without "/" in their names.
+  for (auto it = nodes_map_.begin(); it != nodes_map_.end(); it++) {
+    ScopeNode* node = it->second.get();
+    auto last_slash = node->name().find_last_of("/");
+    if (last_slash == string::npos) {
+      roots_.push_back(node);
+    } else {
+      // The parent entry is guaranteed to exist: AddNode created a
+      // placeholder for every ancestor scope.
+      const string prefix = node->name().substr(0, last_slash);
+      nodes_map_[prefix]->children.push_back(node);
+    }
+  }
+}
+
+// Builds the display tree for the 'scope' view: resolve start roots,
+// attach them under a synthetic kTFProfRoot, account stats, then format.
+const ShowNode* TFScope::ShowInternal(const Options& opts) {
+  // Search from roots recursively to find start node, if start_name_regexes
+  // is specified.
+  std::vector<ScopeNode*> roots = roots_;
+  if (opts.start_name_regexes.size() != 1 ||
+      opts.start_name_regexes[0] != ".*") {
+    roots = SearchRoot(roots, opts.start_name_regexes);
+  }
+
+  ScopeNode* root = CreateParentNode(kTFProfRoot);
+  root->children.assign(roots.begin(), roots.end());
+  Account({root}, opts);
+
+  // kTFProfRoot is always shown (see ShouldShow), so PrintScope returns a
+  // non-empty vector here.
+  root = PrintScope({root}, opts, 1, 0)[0];
+  return root;
+}
+
+// Depth-first search for start nodes: a node whose name fully matches any
+// regex becomes a result root and its subtree is not descended further;
+// otherwise the search recurses into its children.
+std::vector<ScopeNode*> TFScope::SearchRoot(
+    std::vector<ScopeNode*> roots, const std::vector<string>& regexes) {
+  std::vector<ScopeNode*> matches;
+  for (ScopeNode* candidate : roots) {
+    bool is_start = false;
+    for (const string& regex : regexes) {
+      is_start = RE2::FullMatch(candidate->name(), regex);
+      if (is_start) break;
+    }
+    if (is_start) {
+      // Found a start node at this branch; no need to go deeper.
+      matches.push_back(candidate);
+    } else {
+      std::vector<ScopeNode*> sub = SearchRoot(candidate->children, regexes);
+      matches.insert(matches.end(), sub.begin(), sub.end());
+    }
+  }
+  return matches;
+}
+
+// Recursively formats the subtree. For each input root, returns the nodes
+// that made it into the display; a hidden node is replaced by its shown
+// descendants. 'last_ident' is the indentation (in spaces) of the nearest
+// shown ancestor.
+std::vector<ScopeNode*> TFScope::PrintScope(const std::vector<ScopeNode*> roots,
+                                            const Options& opts, int depth,
+                                            int last_ident) {
+  std::vector<ScopeNode*> show_nodes;
+
+  for (ScopeNode* node : roots) {
+    int nlast_ident = last_ident;
+    bool show = ShouldShow(node, opts, depth);
+    if (show) {
+      node->formatted_str.clear();
+      if (opts.account_displayed_op_only) {
+        // Totals will be rebuilt below from displayed children only.
+        node->ResetTotalStats();
+        node->AddSelfToTotalStats();
+      }
+      nlast_ident += 2;
+    }
+
+    std::vector<ScopeNode*> show_cnodes;
+    if (!ShouldTrim(node, opts.trim_name_regexes)) {
+      show_cnodes = PrintScope(node->children, opts, depth + 1, nlast_ident);
+    }
+    if (show) {
+      show_cnodes = SortNodes(show_cnodes, opts);
+      string children_str;
+      for (ScopeNode* sc : show_cnodes) {
+        children_str += sc->formatted_str;
+        node->mutable_proto()->add_children()->MergeFrom(sc->proto());
+        if (opts.account_displayed_op_only) {
+          node->AggregateTotalStats(sc);
+        }
+      }
+
+      node->formatted_str =
+          strings::Printf("%s%s\n", string(last_ident, ' ').c_str(),
+                          node->Format(opts).c_str());
+
+      // kShown[5] == "tensor_value": append checkpoint contents if selected.
+      if (opts.select.find(kShown[5]) != opts.select.end()) {
+        std::unique_ptr<TFProfTensor> tfprof_tensor;
+        if (LookUpCheckPoint(node->name(), &tfprof_tensor)) {
+          string value_str;
+          tfprof_tensor->Display(&value_str,
+                                 node->mutable_proto()->mutable_tensor_value());
+          node->formatted_str += value_str;
+        }
+      }
+
+      node->formatted_str += children_str;
+      show_nodes.push_back(node);
+    } else {
+      // Hidden node: surface its shown descendants in its place.
+      show_nodes.insert(show_nodes.end(), show_cnodes.begin(),
+                        show_cnodes.end());
+    }
+  }
+  return show_nodes;
+}
+
+// Bottom-up pass that decides, per node, whether its own stats count
+// (ShouldAccount) and rolls every child's totals into its parent.
+void TFScope::Account(const std::vector<ScopeNode*>& roots,
+                      const Options& opts) {
+  if (roots.empty()) return;
+
+  for (ScopeNode* n : roots) {
+    // Clear totals first; they are rebuilt from descendants below.
+    n->ResetTotalStats();
+    Account(n->children, opts);
+
+    n->account = ShouldAccount(n, opts);
+    if (n->account) {
+      n->AddSelfToTotalStats();
+    }
+    for (ScopeNode* child : n->children) {
+      n->AggregateTotalStats(child);
+    }
+  }
+}
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_scope.h b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_scope.h
new file mode 100644
index 00000000000..3a8ca52b43c
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_scope.h
@@ -0,0 +1,88 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Build a tree structure based on the TensorFlow op names.
+// For example, 'name1/name2' is a child of 'name1'.
+// Stats are aggregated from descendants to ancestors.
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_TFPROF_SCOPE_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_TFPROF_SCOPE_H_
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/c/checkpoint_reader.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_node.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_options.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_show.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_utils.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/tfprof_output.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+namespace tfprof {
+
+// A display node in the scope tree: a ShowNode plus child links, with the
+// base class's stat-aggregation helpers re-exported as public.
+class ScopeNode : public ShowNode {
+ public:
+  explicit ScopeNode(TFNode* node) : ShowNode(node) {}
+  ~ScopeNode() override {}
+
+  void AggregateTotalStats(ScopeNode* node) {
+    ShowNode::AggregateTotalStats(node);
+  }
+
+  void AddSelfToTotalStats() { ShowNode::AddSelfToTotalStats(); }
+
+  void ResetTotalStats() { ShowNode::ResetTotalStats(); }
+
+  // Child scopes; not owned (owned by TFScope::nodes_map_).
+  std::vector<ScopeNode*> children;
+};
+
+// Organizes TFNodes into a tree by name scope ("a/b" is a child of "a")
+// and renders aggregated stats over that tree.
+class TFScope : public TFShow {
+ public:
+  explicit TFScope(checkpoint::CheckpointReader* ckpt_reader)
+      : TFShow(ckpt_reader) {}
+  ~TFScope() override {}
+
+  void AddNode(TFNode* node) override;
+
+  void Build() override;
+
+ private:
+  const ShowNode* ShowInternal(const Options& opts) override;
+
+  // Returns (creating on demand) the placeholder node for scope 'name'.
+  ScopeNode* CreateParentNode(const string& name);
+
+  std::vector<ScopeNode*> SearchRoot(std::vector<ScopeNode*> roots,
+                                     const std::vector<string>& regexes);
+
+  std::vector<ScopeNode*> PrintScope(const std::vector<ScopeNode*> roots,
+                                     const Options& opts, int depth,
+                                     int last_ident);
+
+  void Account(const std::vector<ScopeNode*>& roots, const Options& opts);
+
+  // Top-level scopes (names without "/"); pointers into nodes_map_.
+  std::vector<ScopeNode*> roots_;
+  // Owns NodeDefs synthesized for placeholder parent scopes.
+  std::vector<std::unique_ptr<NodeDef>> node_defs_;
+  std::map<string, std::unique_ptr<TFNode>> parent_nodes_;
+  std::map<string, std::unique_ptr<ScopeNode>> nodes_map_;
+};
+}  // namespace tfprof
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_TFPROF_SCOPE_H_
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_show.cc b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_show.cc
new file mode 100644
index 00000000000..f7275d8ae4d
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_show.cc
@@ -0,0 +1,266 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_show.h"
+
+#include <memory>
+#include <set>
+
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/regexp.h"
+
+namespace tensorflow {
+namespace tfprof {
+// Seeds the display proto from the underlying TFNode: name, device,
+// timing, memory, flop count, and -- when the shape is fully known --
+// the parameter count (product of all dims).
+ShowNode::ShowNode(TFNode* node) : node(node), account(true) {
+  mutable_proto()->set_name(name());
+  if (!node->device().empty()) {
+    mutable_proto()->set_device(node->device());
+  }
+  mutable_proto()->set_exec_micros(node->op_exec_micros());
+  mutable_proto()->set_requested_bytes(node->requested_byptes());
+  mutable_proto()->set_float_ops(node->float_ops());
+
+  if (!node->shape().empty()) {
+    int64 params = 1;
+    bool complete_shape = true;
+    for (int64 d : node->shape()) {
+      // Sometimes parameters could be <0 when a dim is unknown.
+      if (d < 0) {
+        complete_shape = false;
+        break;
+      }
+      params *= d;
+    }
+    if (complete_shape) {
+      mutable_proto()->set_parameters(proto_.parameters() + params);
+    } else {
+      // Fix: terminate the diagnostic with '\n' so it doesn't run into
+      // subsequent stderr output.
+      fprintf(stderr, "Incomplete shape.\n");
+    }
+  }
+}
+
+// Renders "name (meta)", or just the name when no columns are selected.
+string ShowNode::Format(const Options& opts) {
+  if (opts.select.empty()) return name();
+  const string meta = FormatMeta(opts);
+  return strings::Printf("%s (%s)", name().c_str(), meta.c_str());
+}
+
+// Builds the comma-separated stats string for the selected columns.
+// Numeric columns render as "accounted/total"; "--" marks a node whose
+// own stats are not accounted.
+string ShowNode::FormatMeta(const Options& opts) {
+  std::vector<string> info;
+  // kShown[2] == "params".
+  if (opts.select.find(kShown[2]) != opts.select.end()) {
+    const string shape = FormatShapes(node->shape());
+    if (!shape.empty()) {
+      info.push_back(shape);
+    }
+    string params = FormatNumber(proto().total_parameters()) + " params";
+    if (account) {
+      params = FormatNumber(proto().parameters()) + "/" + params;
+    } else {
+      params = "--/" + params;
+    }
+    info.push_back(params);
+  }
+  // kShown[3] == "float_ops".
+  if (opts.select.find(kShown[3]) != opts.select.end()) {
+    string fops = FormatNumber(proto().total_float_ops()) + " flops";
+    if (account) {
+      fops = FormatNumber(proto().float_ops()) + "/" + fops;
+    } else {
+      fops = "--/" + fops;
+    }
+    info.push_back(fops);
+  }
+  // kShown[0] == "bytes".
+  if (opts.select.find(kShown[0]) != opts.select.end()) {
+    string memory = FormatMemory(proto().total_requested_bytes());
+    if (account) {
+      memory = FormatMemory(proto().requested_bytes()) + "/" + memory;
+
+    } else {
+      memory = "--/" + memory;
+    }
+    info.push_back(memory);
+  }
+  // kShown[1] == "micros".
+  if (opts.select.find(kShown[1]) != opts.select.end()) {
+    string time = FormatTime(proto().total_exec_micros());
+    if (account) {
+      time = FormatTime(proto().exec_micros()) + "/" + time;
+    } else {
+      time = "--/" + time;
+    }
+    info.push_back(time);
+  }
+  // kShown[6] == "device".
+  if (opts.select.find(kShown[6]) != opts.select.end()) {
+    if (!proto().device().empty()) {
+      info.push_back(proto().device());
+    }
+  }
+  // kShown[7] == "op_types".
+  if (opts.select.find(kShown[7]) != opts.select.end()) {
+    std::set<string> op_types = node->op_types();
+    // Device is considered a type.
+    if (!proto().device().empty()) {
+      op_types.insert(proto().device());
+    }
+    info.push_back(str_util::Join(op_types, "|"));
+  }
+  return str_util::Join(info, ", ");
+}
+
+// Mutable access to the output proto being assembled for this node.
+TFProfNode* ShowNode::mutable_proto() { return &proto_; }
+
+// Read-only view of the assembled output proto.
+const TFProfNode& ShowNode::proto() const { return proto_; }
+
+// Adds 'node's total_* counters into this node's totals (rolls descendant
+// stats up to an ancestor).
+void ShowNode::AggregateTotalStats(ShowNode* node) {
+  TFProfNode* node_pb = node->mutable_proto();
+  mutable_proto()->set_total_exec_micros(proto().total_exec_micros() +
+                                         node_pb->total_exec_micros());
+  mutable_proto()->set_total_requested_bytes(proto().total_requested_bytes() +
+                                             node_pb->total_requested_bytes());
+  mutable_proto()->set_total_parameters(proto().total_parameters() +
+                                        node_pb->total_parameters());
+  mutable_proto()->set_total_float_ops(proto().total_float_ops() +
+                                       node_pb->total_float_ops());
+}
+
+// Adds this node's own (self) stats into its total_* counters.
+void ShowNode::AddSelfToTotalStats() {
+  mutable_proto()->set_total_exec_micros(proto().total_exec_micros() +
+                                         proto().exec_micros());
+  mutable_proto()->set_total_requested_bytes(proto().total_requested_bytes() +
+                                             proto().requested_bytes());
+  mutable_proto()->set_total_parameters(proto().total_parameters() +
+                                        proto().parameters());
+  mutable_proto()->set_total_float_ops(proto().total_float_ops() +
+                                       proto().float_ops());
+}
+
+// Zeroes the total_* counters prior to re-aggregation.
+void ShowNode::ResetTotalStats() {
+  mutable_proto()->set_total_exec_micros(0);
+  mutable_proto()->set_total_requested_bytes(0);
+  mutable_proto()->set_total_parameters(0);
+  mutable_proto()->set_total_float_ops(0);
+}
+
+// Formats the tree and either prints it to stdout or writes it to the
+// file named by opts.dump_to_file. Returns the root's stats proto.
+const TFProfNode& TFShow::Show(const Options& opts) {
+  const ShowNode* root = ShowInternal(opts);
+  if (opts.dump_to_file.empty()) {
+    printf("%s", root->formatted_str.c_str());
+    fflush(stdout);
+  } else {
+    // A failed dump is reported on stderr but does not abort the query.
+    Status s = WriteStringToFile(Env::Default(), opts.dump_to_file,
+                                 root->formatted_str);
+    if (!s.ok()) {
+      fprintf(stderr, "%s\n", s.ToString().c_str());
+    }
+  }
+  return root->proto();
+}
+
+// Reads the checkpoint tensor named 'name' into *tensor. Returns false
+// when no checkpoint reader is configured, the name is the synthetic
+// kTFProfRoot, or the read fails (in which case the error is logged).
+bool TFShow::LookUpCheckPoint(const string& name,
+                              std::unique_ptr<TFProfTensor>* tensor) {
+  if (name == kTFProfRoot || !ckpt_reader_ || !tensor) {
+    return false;
+  }
+  std::unique_ptr<Tensor> out_tensor;
+  // The TF C API status object must be freed on every path.
+  TF_Status* status = TF_NewStatus();
+  ckpt_reader_->GetTensor(name, &out_tensor, status);
+  if (TF_GetCode(status) != TF_OK) {
+    fprintf(stderr, "%s\n", TF_Message(status));
+    TF_DeleteStatus(status);
+    return false;
+  }
+  tensor->reset(new TFProfTensor(std::move(out_tensor)));
+  TF_DeleteStatus(status);
+  return true;
+}
+
+// Decides whether a node appears in the output: it must be accounted,
+// pass the numeric thresholds and the depth limit, match device_regexes
+// and show_name_regexes, and not match any hide_name_regex.
+bool TFShow::ShouldShow(ShowNode* node, const Options& opts, int depth) {
+  // Always show kTFProfRoot.
+  if (node->name() == kTFProfRoot) return true;
+
+  if (!node->account) return false;
+
+  // Thresholds compare the node's *own* (not total) stats.
+  if (node->proto().requested_bytes() < opts.min_bytes ||
+      node->proto().exec_micros() < opts.min_micros ||
+      node->proto().parameters() < opts.min_params ||
+      node->proto().float_ops() < opts.min_float_ops ||
+      depth > opts.max_depth || !ShouldShowIfExtra(node, opts, depth)) {
+    return false;
+  }
+
+  bool show = false;
+  // A lone ".*" short-circuits the regex scan.
+  if (opts.device_regexes.size() == 1 && opts.device_regexes[0] == ".*") {
+    show = true;
+  } else {
+    for (const string& regex : opts.device_regexes) {
+      if (RE2::FullMatch(node->proto().device(), regex)) {
+        show = true;
+        break;
+      }
+    }
+  }
+  // Don't show if device_regexes don't cover it.
+  if (!show) return false;
+
+  show = false;
+  if (opts.show_name_regexes.size() == 1 && opts.show_name_regexes[0] == ".*") {
+    show = true;
+  } else {
+    for (const string& regex : opts.show_name_regexes) {
+      if (RE2::FullMatch(node->name(), regex)) {
+        show = true;
+        break;
+      }
+    }
+  }
+  // Don't show if show_name_regexes don't cover it.
+  if (!show) return false;
+  // Don't show if hide_name_regexes cover it.
+  for (const string& regex : opts.hide_name_regexes) {
+    if (RE2::FullMatch(node->name(), regex)) return false;
+  }
+  return true;
+}
+
+// Returns true when any regex fully matches the node's name.
+bool TFShow::ShouldTrim(ShowNode* node, const std::vector<string>& regexes) {
+  bool trim = false;
+  for (const string& pattern : regexes) {
+    trim = RE2::FullMatch(node->name(), pattern);
+    if (trim) break;
+  }
+  return trim;
+}
+
+// A node's own stats are counted when any account_type_regex fully
+// matches one of its op types or its device; a lone ".*" short-circuits
+// to true.
+bool TFShow::ShouldAccount(ShowNode* node, const Options& opts) {
+  if (opts.account_type_regexes.size() == 1 &&
+      opts.account_type_regexes[0] == ".*") {
+    return true;
+  }
+  for (const string& regex : opts.account_type_regexes) {
+    for (const string& type : node->node->op_types()) {
+      if (RE2::FullMatch(type, regex)) {
+        return true;
+      }
+    }
+    // The device string is treated as an additional matchable type.
+    if (RE2::FullMatch(node->proto().device(), regex)) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_show.h b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_show.h
new file mode 100644
index 00000000000..4b5d6592e5a
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_show.h
@@ -0,0 +1,127 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Parent class and utilities for tfprof_graph and tfprof_scope.
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_TFPROF_SHOW_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_TFPROF_SHOW_H_
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "tensorflow/c/checkpoint_reader.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_constants.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_node.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_options.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_tensor.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_utils.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/tfprof_output.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+
+namespace tensorflow {
+namespace tfprof {
+class ShowNode {
+ public:
+  explicit ShowNode(TFNode* node);
+  virtual ~ShowNode() {}
+
+  const string& name() const { return node->node_def()->name(); }
+  TFProfNode* mutable_proto();
+  const TFProfNode& proto() const;
+
+  string Format(const Options& opts);
+
+  string FormatMeta(const Options& opts);
+
+  TFNode* node;
+  bool account;
+  string formatted_str;
+
+ protected:
+  void AggregateTotalStats(ShowNode* node);
+
+  void AddSelfToTotalStats();
+
+  void ResetTotalStats();
+
+  TFProfNode proto_;
+};
+
+class TFShow {
+ public:
+  explicit TFShow(checkpoint::CheckpointReader* ckpt_reader)
+      : ckpt_reader_(ckpt_reader) {}
+  virtual ~TFShow() {}
+  virtual void AddNode(TFNode* node) = 0;
+  virtual void Build() = 0;
+  const TFProfNode& Show(const Options& opts);
+
+ protected:
+  virtual const ShowNode* ShowInternal(const Options& opts) = 0;
+
+  bool LookUpCheckPoint(const string& name,
+                        std::unique_ptr<TFProfTensor>* tensor);
+
+  // Overridden by subclass if extra requirements need to be met.
+  virtual bool ShouldShowIfExtra(ShowNode* node, const Options& opts,
+                                 int depth) {
+    return true;
+  }
+
+  bool ShouldShow(ShowNode* node, const Options& opts, int depth);
+
+  bool ShouldTrim(ShowNode* node, const std::vector<string>& regexes);
+
+  bool ShouldAccount(ShowNode* node, const Options& opts);
+
+  template <typename T>
+  std::vector<T*> SortNodes(const std::vector<T*>& nodes, const Options& opts) {
+    if (opts.order_by.empty() || nodes.empty()) {
+      return nodes;
+    }
+    std::vector<T*> sorted_nodes = nodes;
+    std::sort(sorted_nodes.begin(), sorted_nodes.end(), [&opts](const T* n1,
+                                                                const T* n2) {
+      if (n1->name() == kTFProfRoot) return true;
+      if (n2->name() == kTFProfRoot) return false;
+      bool name_cmp = n1->name() < n2->name();
+      if (opts.order_by == kOrderBy[0]) {
+        return name_cmp;
+      } else if (opts.order_by == kOrderBy[1]) {
+        return n1->proto().total_requested_bytes() >
+               n2->proto().total_requested_bytes();
+      } else if (opts.order_by == kOrderBy[2]) {
+        return n1->proto().total_exec_micros() >
+               n2->proto().total_exec_micros();
+      } else if (opts.order_by == kOrderBy[3]) {
+        return n1->proto().total_parameters() > n2->proto().total_parameters();
+      } else if (opts.order_by == kOrderBy[4]) {
+        return n1->proto().total_float_ops() > n2->proto().total_float_ops();
+      }
+      return name_cmp;
+    });
+    return sorted_nodes;
+  }
+
+  checkpoint::CheckpointReader* ckpt_reader_;
+};
+
+}  // namespace tfprof
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_TFPROF_SHOW_H_
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_show_test.cc b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_show_test.cc
new file mode 100644
index 00000000000..81396e31cca
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_show_test.cc
@@ -0,0 +1,92 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_stats.h"
+
+#include <utility>
+
+#include "tensorflow/c/checkpoint_reader.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_constants.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_options.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_utils.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/tfprof_log.pb.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/tfprof_output.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+namespace tfprof {
+class TFProfShowTest : public ::testing::Test {
+ protected:
+  TFProfShowTest() {
+    string graph_path = io::JoinPath(
+        testing::TensorFlowSrcRoot(),
+        "contrib/tfprof/tools/tfprof/internal/testdata/graph.pbtxt");
+    std::unique_ptr<tensorflow::GraphDef> graph_pb(new tensorflow::GraphDef());
+    TF_CHECK_OK(ReadGraphDefText(Env::Default(), graph_path, graph_pb.get()));
+
+    std::unique_ptr<tensorflow::RunMetadata> run_meta_pb(
+        new tensorflow::RunMetadata());
+    string run_meta_path =
+        io::JoinPath(testing::TensorFlowSrcRoot(),
+                     "contrib/tfprof/tools/tfprof/internal/testdata/run_meta");
+    TF_CHECK_OK(
+        ReadBinaryProto(Env::Default(), run_meta_path, run_meta_pb.get()));
+
+    std::unique_ptr<OpLog> op_log_pb(new OpLog());
+    string op_log_path = io::JoinPath(
+        testing::TensorFlowSrcRoot(),
+        "contrib/tfprof/tools/tfprof/internal/testdata/tfprof_log");
+    TF_CHECK_OK(ReadBinaryProto(Env::Default(), op_log_path, op_log_pb.get()));
+
+    string ckpt_path =
+        io::JoinPath(testing::TensorFlowSrcRoot(),
+                     "contrib/tfprof/tools/tfprof/internal/testdata/ckpt");
+    TF_Status* status = TF_NewStatus();
+    std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader(
+        new checkpoint::CheckpointReader(ckpt_path, status));
+    CHECK(TF_GetCode(status) == TF_OK);
+    TF_DeleteStatus(status);
+
+    tf_stats_.reset(new TFStats(std::move(graph_pb), std::move(run_meta_pb),
+                                std::move(op_log_pb), std::move(ckpt_reader)));
+  }
+
+  std::unique_ptr<TFStats> tf_stats_;
+};
+
+TEST_F(TFProfShowTest, DumpScopeMode) {
+  string dump_file = io::JoinPath(testing::TmpDir(), "dump");
+  Options opts(5, 0, 0, 0, 0, {".*"}, "name",
+               {"Variable"},  // account_type_regexes
+               {".*"}, {""}, {".*"}, {""}, false,
+               {"params", "bytes", "micros", "float_ops", "num_hidden_ops"},
+               false, dump_file);
+  tf_stats_->PrintGraph("scope", opts);
+
+  string dump_str;
+  TF_CHECK_OK(ReadFileToString(Env::Default(), dump_file, &dump_str));
+  EXPECT_EQ(
+      "_TFProfRoot (--/450 params, --/0 flops, --/1.80KB, --/0us)\n  DW "
+      "(3x3x3x6, 162/162 params, 0/0 flops, 648B/648B, 0us/0us)\n  DW2 "
+      "(2x2x6x12, 288/288 params, 0/0 flops, 1.15KB/1.15KB, 0us/0us)\n",
+      dump_str);
+}
+
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_stats.cc b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_stats.cc
new file mode 100644
index 00000000000..54fce4772bd
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_stats.cc
@@ -0,0 +1,130 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_stats.h"
+
+#include <stdio.h>
+#include <utility>
+
+#include "tensorflow/core/framework/step_stats.pb.h"
+
+namespace tensorflow {
+namespace tfprof {
+TFStats::TFStats(std::unique_ptr<GraphDef> graph,
+                 std::unique_ptr<RunMetadata> run_meta,
+                 std::unique_ptr<OpLog> op_log,
+                 std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader)
+    : graph_(std::move(graph)),
+      run_meta_(std::move(run_meta)),
+      op_log_(std::move(op_log)),
+      ckpt_reader_(std::move(ckpt_reader)) {
+  CHECK(graph_) << "Must at least have GraphDef";
+
+  printf("Parsing GraphDef...\n");
+  ParseGraph();
+  if (run_meta_) {
+    printf("Parsing RunMetadata...\n");
+    ParseRunMeta();
+  }
+  if (op_log_) {
+    printf("Parsing OpLog...\n");
+    ParseOpLog();
+  }
+
+  if (ckpt_reader_) {
+    printf("Parsing Checkpoint...\n");
+    for (const auto& v : ckpt_reader_->GetVariableToShapeMap()) {
+      auto node = nodes_map_.find(v.first);
+      if (node != nodes_map_.end()) {
+        node->second.AddOpType("_checkpoint_variables");
+      }
+    }
+  }
+
+  printf("Preparing Views...\n");
+  scope_view_ = std::unique_ptr<TFScope>(new TFScope(ckpt_reader_.get()));
+  graph_view_ = std::unique_ptr<TFGraph>(new TFGraph(ckpt_reader_.get()));
+  for (auto it = nodes_map_.begin(); it != nodes_map_.end(); it++) {
+    scope_view_->AddNode(&it->second);
+    graph_view_->AddNode(&it->second);
+  }
+  scope_view_->Build();
+  graph_view_->Build();
+}
+
+const TFProfNode& TFStats::PrintGraph(const string& cmd, const Options& opts) {
+  if (cmd == kCmds[0]) {
+    return scope_view_->Show(opts);
+  } else if (cmd == kCmds[1]) {
+    return graph_view_->Show(opts);
+  } else {
+    fprintf(stderr, "Unknown command: %s\n", cmd.c_str());
+    return empty_node_;
+  }
+}
+
+void TFStats::ParseGraph() {
+  for (const NodeDef& node : graph_->node()) {
+    CHECK(nodes_map_.find(node.name()) == nodes_map_.end());
+    nodes_map_[node.name()] = TFNode(&node);
+  }
+  for (auto it = nodes_map_.begin(); it != nodes_map_.end(); it++) {
+    const NodeDef* node_def = it->second.node_def();
+    for (string node_input : node_def->input()) {
+      // input name format can be: "^node:src_output"
+      auto prefix_pos = node_input.find(":");
+      if (prefix_pos != node_input.npos) {
+        node_input = node_input.substr(0, prefix_pos);
+      }
+      if (node_input.substr(0, 1) == "^") {
+        node_input = node_input.substr(1);
+      }
+      auto input_node = nodes_map_.find(node_input);
+      if (input_node == nodes_map_.end()) {
+        continue;
+      }
+      it->second.AddInput(&input_node->second);
+    }
+  }
+}
+
+void TFStats::ParseOpLog() {
+  for (const OpLogEntry& entry : op_log_->log_entries()) {
+    auto node = nodes_map_.find(entry.name());
+    if (node == nodes_map_.end()) continue;
+    for (const string& type : entry.types()) {
+      node->second.AddOpType(type);
+    }
+    if (entry.float_ops()) {
+      node->second.AddFloatOps(entry.float_ops());
+    }
+  }
+}
+
+void TFStats::ParseRunMeta() {
+  if (!run_meta_->has_step_stats()) return;
+
+  for (const auto& dev_stat : run_meta_->step_stats().dev_stats()) {
+    for (const auto& node_stat : dev_stat.node_stats()) {
+      auto node = nodes_map_.find(node_stat.node_name());
+      if (node == nodes_map_.end()) {
+        continue;
+      }
+      node->second.AddStepStat(dev_stat.device(), &node_stat);
+    }
+  }
+}
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_stats.h b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_stats.h
new file mode 100644
index 00000000000..1246a2fae2f
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_stats.h
@@ -0,0 +1,82 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Core API of tfprof.
+// 1. Load protos generated from a tensorflow model.
+// 2. Build in-memory representations of the tensorflow model, annotate the
+//    representation with various stats, such as params,times,memory,etc.
+// 3. Accept command and options to selectively aggregate stats for analysis
+//    and print out the results.
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_TFPROF_STATS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_TFPROF_STATS_H_
+
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+
+#include "tensorflow/c/checkpoint_reader.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_graph.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_node.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_options.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_scope.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_show.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_utils.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/tfprof_log.pb.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/tfprof_output.pb.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+namespace tfprof {
+
+class TFStats {
+ public:
+  TFStats(std::unique_ptr<GraphDef> graph,
+          std::unique_ptr<RunMetadata> run_meta, std::unique_ptr<OpLog> op_log,
+          std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader);
+  ~TFStats() {}
+
+  // Prints the results to stdout. Also returns the printed output in
+  // a proto.
+  const TFProfNode& PrintGraph(const string& cmd, const Options& opts);
+
+ private:
+  void ParseGraph();
+
+  void ParseOpLog();
+
+  void ParseRunMeta();
+
+  std::unique_ptr<TFScope> scope_view_;
+  std::unique_ptr<TFGraph> graph_view_;
+  std::unique_ptr<GraphDef> graph_;
+  std::unique_ptr<RunMetadata> run_meta_;
+  std::unique_ptr<OpLog> op_log_;
+  std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader_;
+  // Store TFNode instead of TFNode* to avoid large number of dynamic alloc.
+  std::map<string, TFNode> nodes_map_;
+  TFProfNode empty_node_;
+};
+
+}  // namespace tfprof
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_TFPROF_STATS_H_
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_stats_test.cc b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_stats_test.cc
new file mode 100644
index 00000000000..06b288fdce7
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_stats_test.cc
@@ -0,0 +1,194 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_stats.h"
+
+#include <utility>
+
+#include "tensorflow/c/checkpoint_reader.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_constants.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_options.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_utils.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/tfprof_log.pb.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/tfprof_output.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+namespace tfprof {
+class TFProfStatsTest : public ::testing::Test {
+ protected:
+  TFProfStatsTest() {
+    string graph_path = io::JoinPath(
+        testing::TensorFlowSrcRoot(),
+        "contrib/tfprof/tools/tfprof/internal/testdata/graph.pbtxt");
+    std::unique_ptr<tensorflow::GraphDef> graph_pb(new tensorflow::GraphDef());
+    TF_CHECK_OK(ReadGraphDefText(Env::Default(), graph_path, graph_pb.get()));
+
+    std::unique_ptr<tensorflow::RunMetadata> run_meta_pb(
+        new tensorflow::RunMetadata());
+    string run_meta_path =
+        io::JoinPath(testing::TensorFlowSrcRoot(),
+                     "contrib/tfprof/tools/tfprof/internal/testdata/run_meta");
+    TF_CHECK_OK(
+        ReadBinaryProto(Env::Default(), run_meta_path, run_meta_pb.get()));
+
+    std::unique_ptr<OpLog> op_log_pb(new OpLog());
+    string op_log_path = io::JoinPath(
+        testing::TensorFlowSrcRoot(),
+        "contrib/tfprof/tools/tfprof/internal/testdata/tfprof_log");
+    TF_CHECK_OK(ReadBinaryProto(Env::Default(), op_log_path, op_log_pb.get()));
+
+    string ckpt_path =
+        io::JoinPath(testing::TensorFlowSrcRoot(),
+                     "contrib/tfprof/tools/tfprof/internal/testdata/ckpt");
+    TF_Status* status = TF_NewStatus();
+    std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader(
+        new checkpoint::CheckpointReader(ckpt_path, status));
+    CHECK(TF_GetCode(status) == TF_OK);
+    TF_DeleteStatus(status);
+
+    tf_stats_.reset(new TFStats(std::move(graph_pb), std::move(run_meta_pb),
+                                std::move(op_log_pb), std::move(ckpt_reader)));
+  }
+
+  std::unique_ptr<TFStats> tf_stats_;
+};
+
+TEST_F(TFProfStatsTest, CustomOpType) {
+  Options opts(3, 0, 0, 0, 0, {".*"}, "name",
+               {kTrainableVarType},  // account_type_regexes
+               {".*"}, {""}, {".*"}, {""}, false,
+               {"params", "bytes", "micros", "float_ops", "num_hidden_ops"},
+               false);
+  const TFProfNode& root = tf_stats_->PrintGraph("scope", opts);
+
+  TFProfNode expected;
+  CHECK(protobuf::TextFormat::ParseFromString(
+      "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
+      "0\ntotal_exec_micros: 0\ntotal_requested_bytes: 1800\ntotal_parameters: "
+      "450\nchildren {\n  name: \"DW\"\n  exec_micros: 0\n  requested_bytes: "
+      "648\n  parameters: 162\n  total_exec_micros: 0\n  "
+      "total_requested_bytes: 648\n  total_parameters: 162\n  device: "
+      "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 0\n  "
+      "total_float_ops: 0\n}\nchildren {\n  name: \"DW2\"\n  exec_micros: 0\n  "
+      "requested_bytes: 1152\n  parameters: 288\n  total_exec_micros: 0\n  "
+      "total_requested_bytes: 1152\n  total_parameters: 288\n  device: "
+      "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 0\n  "
+      "total_float_ops: 0\n}\nfloat_ops: 0\ntotal_float_ops: 0\n",
+      &expected));
+  EXPECT_EQ(expected.DebugString(), root.DebugString());
+}
+
+TEST_F(TFProfStatsTest, CheckPointOpType) {
+  Options opts(
+      3, 0, 0, 0, 0, {".*"}, "name", {kCkptVarType},  // account_type_regexes
+      {".*"}, {""}, {".*"}, {""}, false,
+      {"params", "bytes", "micros", "float_ops", "num_hidden_ops"}, false);
+  const TFProfNode& root = tf_stats_->PrintGraph("scope", opts);
+
+  TFProfNode expected;
+  CHECK(protobuf::TextFormat::ParseFromString(
+      "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
+      "0\ntotal_exec_micros: 0\ntotal_requested_bytes: 1800\ntotal_parameters: "
+      "450\nchildren {\n  name: \"DW\"\n  exec_micros: 0\n  requested_bytes: "
+      "648\n  parameters: 162\n  total_exec_micros: 0\n  "
+      "total_requested_bytes: 648\n  total_parameters: 162\n  device: "
+      "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 0\n  "
+      "total_float_ops: 0\n}\nchildren {\n  name: \"DW2\"\n  exec_micros: 0\n  "
+      "requested_bytes: 1152\n  parameters: 288\n  total_exec_micros: 0\n  "
+      "total_requested_bytes: 1152\n  total_parameters: 288\n  device: "
+      "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 0\n  "
+      "total_float_ops: 0\n}\nfloat_ops: 0\ntotal_float_ops: 0\n",
+      &expected));
+  EXPECT_EQ(expected.DebugString(), root.DebugString());
+}
+
+TEST_F(TFProfStatsTest, TestGraph) {
+  Options opts(100, 0, 10000, 0, 0, {".*"}, "name", {".*"},
+               {"cost.*"},  // start_name_regexes
+               {""}, {".*"}, {""}, false,
+               {"params", "bytes", "micros", "float_ops", "num_hidden_ops"},
+               false);
+  const TFProfNode& root = tf_stats_->PrintGraph("graph", opts);
+
+  TFProfNode expected;
+  CHECK(protobuf::TextFormat::ParseFromString(
+      "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: 0\ninputs: "
+      "0\ntotal_exec_micros: 0\ntotal_requested_bytes: 0\ntotal_parameters: "
+      "0\ntotal_inputs: 0\nfloat_ops: 0\ntotal_float_ops: 0\n",
+      &expected));
+  EXPECT_EQ(expected.DebugString(), root.DebugString());
+}
+
+TEST_F(TFProfStatsTest, TestFloatOps) {
+  Options opts(10, 0, 0, 0, 1, {".*"}, "name", {".*"}, {".*"}, {""}, {".*"},
+               {""}, false, {"float_ops"}, false);
+  const TFProfNode& root = tf_stats_->PrintGraph("scope", opts);
+
+  TFProfNode expected;
+  CHECK(protobuf::TextFormat::ParseFromString(
+      "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
+      "0\ntotal_exec_micros: 11\ntotal_requested_bytes: "
+      "5280\ntotal_parameters: 450\nchildren {\n  name: \"Conv2D\"\n  "
+      "exec_micros: 0\n  requested_bytes: 432\n  total_exec_micros: 0\n  "
+      "total_requested_bytes: 432\n  total_parameters: 0\n  device: "
+      "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 5832\n  "
+      "total_float_ops: 5832\n}\nchildren {\n  name: \"Conv2D_1\"\n  "
+      "exec_micros: 10\n  requested_bytes: 384\n  total_exec_micros: 10\n  "
+      "total_requested_bytes: 384\n  total_parameters: 0\n  device: "
+      "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 4608\n  "
+      "total_float_ops: 4608\n}\nfloat_ops: 0\ntotal_float_ops: 10440\n",
+      &expected));
+  EXPECT_EQ(expected.DebugString(), root.DebugString());
+}
+
+TEST_F(TFProfStatsTest, TestAccountShownNameOnly) {
+  Options opts(100, 0, 0, 0, 0, {".*"}, "name", {".*"}, {".*"}, {""},
+               {"unit_2_1.*DW"},  // show_name_regexes.
+               {""}, true,        // account_displayed_op_only.
+               {"params"}, false);
+  const TFProfNode& root = tf_stats_->PrintGraph("scope", opts);
+
+  TFProfNode expected;
+  CHECK(protobuf::TextFormat::ParseFromString(
+      "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
+      "0\ntotal_exec_micros: 0\ntotal_requested_bytes: 0\ntotal_parameters: "
+      "0\nfloat_ops: 0\ntotal_float_ops: 0\n",
+      &expected));
+  EXPECT_EQ(expected.DebugString(), root.DebugString());
+}
+
+TEST_F(TFProfStatsTest, TestShowTensorValue) {
+  Options opts(10, 0, 0, 0, 0, {".*"}, "name", {".*"}, {".*"}, {""},
+               {"unit_1_0.*gamma"}, {""}, false,
+               {"tensor_value"},  // Show tensor value from checkpoint.
+               false);
+  const TFProfNode& root = tf_stats_->PrintGraph("scope", opts);
+  TFProfNode expected;
+  CHECK(protobuf::TextFormat::ParseFromString(
+      "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
+      "0\ntotal_exec_micros: 11\ntotal_requested_bytes: "
+      "5280\ntotal_parameters: 450\nfloat_ops: 0\ntotal_float_ops: 10440\n",
+      &expected));
+  EXPECT_EQ(expected.DebugString(), root.DebugString());
+}
+
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_tensor.cc b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_tensor.cc
new file mode 100644
index 00000000000..c21626919fa
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_tensor.cc
@@ -0,0 +1,78 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_tensor.h"
+
+namespace tensorflow {
+namespace tfprof {
+void TFProfTensor::Display(string* formatted_str,
+                           TFProfTensorProto* tfprof_tensor_pb) {
+  if (formatted_str) {
+    if (formatted_str_.length() >= kTFProfTenosrMaxDisplayLen) {
+      *formatted_str =
+          strings::StrCat(formatted_str_, "...omitted from display\n\n");
+    } else {
+      *formatted_str = formatted_str_;
+    }
+  }
+  if (tfprof_tensor_pb) {
+    tfprof_tensor_pb->MergeFrom(tfprof_tensor_pb_);
+  }
+}
+
+void TFProfTensor::Build() {
+  tfprof_tensor_pb_.set_dtype(tensor_->dtype());
+
+  switch (tensor_->dtype()) {
+    // Double for all floats.
+    case DataType::DT_FLOAT:
+    case DataType::DT_DOUBLE: {
+      std::vector<double> values_vec;
+      if (tensor_->dtype() == DataType::DT_FLOAT) {
+        GetValueVec<float, double>(&values_vec);
+      } else if (tensor_->dtype() == DataType::DT_DOUBLE) {
+        GetValueVec<double, double>(&values_vec);
+      }
+      BuildOutput<double>(0, 0, values_vec, &tfprof_tensor_pb_);
+      break;
+    }
+    // Int64 for all integers.
+    case DataType::DT_INT32:
+    case DataType::DT_INT64: {
+      std::vector<int64> values_vec;
+      if (tensor_->dtype() == DataType::DT_INT32) {
+        GetValueVec<int32, int64>(&values_vec);
+      } else if (tensor_->dtype() == DataType::DT_INT64) {
+        GetValueVec<int64, int64>(&values_vec);
+      }
+      BuildOutput<int64>(0, 0, values_vec, &tfprof_tensor_pb_);
+      break;
+    }
+    case DataType::DT_STRING: {
+      // Not supported by TensorFlow.
+      std::vector<string> values_vec;
+      GetValueVec<string, string>(&values_vec);
+      BuildOutput<string>(0, 0, values_vec, &tfprof_tensor_pb_);
+      break;
+    }
+    default: {
+      fprintf(stderr, "Not Supported type %d\n", tensor_->dtype());
+      break;
+    }
+  }
+}
+
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_tensor.h b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_tensor.h
new file mode 100644
index 00000000000..471a1db4172
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_tensor.h
@@ -0,0 +1,120 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// TFProf representation of a Tensor's value.
+// 1. Multi-dimension tensor is flattened in row major, and stored in proto.
+// 2. Integers are up-cast to int64 and floats are up-cast to double. Strings
+//    are not supported by the TensorFlow CheckPointReader library, though
+//    they are supported by the current code.
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_TFPROF_TENSOR_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_TFPROF_TENSOR_H_
+
+#include <typeinfo>
+
+#include "tensorflow/contrib/tfprof/tools/tfprof/tfprof_output.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+
+namespace tensorflow {
+namespace tfprof {
+
+// TFProf's view of one Tensor's value: on construction it flattens the
+// tensor (row-major) into both a TFProfTensorProto and a human-readable
+// string, ready to be fetched via Display().
+class TFProfTensor {
+ public:
+  // Takes ownership of 'tensor'; Build() runs immediately so both outputs
+  // are materialized before any Display() call.
+  explicit TFProfTensor(std::unique_ptr<Tensor> tensor)
+      : tensor_(std::move(tensor)) {
+    Build();
+  }
+
+  // If pointers are provided, they are filled by the method.
+  void Display(string* formatted_str, TFProfTensorProto* tfprof_tensor_pb);
+
+ private:
+  // Max length of tensor value displayed to CLI.
+  // NOTE(review): "Tenosr" is a typo for "Tensor"; the name is kept as-is
+  // because the .cc file references this exact identifier.
+  const int64 kTFProfTenosrMaxDisplayLen = 10000;
+  // Max length after which a latency warning will be printed.
+  const int64 kTFProfTensorMaxWarnLen = 100000;
+
+  // Fills tfprof_tensor_pb_ and formatted_str_ from tensor_.
+  void Build();
+
+  // It assumes the flatten values are stored in row-major, which is mentioned
+  // indirectly at various places:
+  // TODO(xpan): Further verifying it.
+  //
+  // Recursive proto-filler / pretty-printer: each call handles one dimension
+  // ('depth'); at the innermost dimension it consumes values starting at flat
+  // index 'start' and returns the index one past the last element consumed,
+  // which the caller uses to continue with the next slice.
+  template <typename T>
+  int64 BuildOutput(int64 start, int depth, const std::vector<T>& values,
+                    TFProfTensorProto* dim) {
+    formatted_str_ += "[";
+    int64 nstart = start;
+    for (int i = 0; i < tensor_->dim_size(depth); i++) {
+      // Last dimension, pull the values.
+      if (depth == tensor_->dims() - 1) {
+        std::ostringstream sstream;
+        sstream << values[nstart];
+
+        // Dispatch on the element type; T is one of double, int64 or string
+        // (Build() widens every dtype to one of these before calling).
+        if (typeid(values[nstart]) == typeid(double)) {
+          double double_val;
+          CHECK(strings::safe_strtod(sstream.str().c_str(), &double_val));
+          dim->add_value_double(double_val);
+          formatted_str_ += strings::Printf(
+              "%.2f ", dim->value_double(dim->value_double_size() - 1));
+        } else if (typeid(values[nstart]) == typeid(int64)) {
+          int64 int64_val;
+          CHECK(strings::safe_strto64(sstream.str().c_str(), &int64_val));
+          dim->add_value_int64(int64_val);
+          formatted_str_ += strings::Printf(
+              "%lld ", dim->value_int64(dim->value_int64_size() - 1));
+        } else if (typeid(values[nstart]) == typeid(string)) {
+          dim->add_value_str(sstream.str());
+          formatted_str_ =
+              strings::StrCat(formatted_str_, "'",
+                              dim->value_str(dim->value_str_size() - 1) + "' ");
+        } else {
+          CHECK(false) << "Unsupported type: " << typeid(values[nstart]).name();
+        }
+        ++nstart;
+      } else {
+        // Not-last dimension. Drill deeper.
+        nstart = BuildOutput<T>(nstart, depth + 1, values, dim);
+      }
+    }
+    // Cap only the display string; the proto still receives every value.
+    if (formatted_str_.length() > kTFProfTenosrMaxDisplayLen) {
+      formatted_str_ = formatted_str_.substr(0, kTFProfTenosrMaxDisplayLen);
+    }
+    formatted_str_ += "],\n";
+    return nstart;
+  }
+
+  // Copies tensor_'s elements into *value_vec, casting each from T (the
+  // stored dtype) to U (the widened output type).
+  template <typename T, typename U>
+  void GetValueVec(std::vector<U>* value_vec) {
+    // TODO(xpan): Address the huge tensor problem.
+    if (tensor_->NumElements() > kTFProfTensorMaxWarnLen) {
+      fprintf(stderr, "Showing huge tensor, the tool might halt...\n");
+    }
+    auto values = tensor_->flat<T>();
+    for (int64 i = 0; i < tensor_->NumElements(); i++) {
+      value_vec->push_back(static_cast<U>(values(i)));
+    }
+  }
+
+  TFProfTensorProto tfprof_tensor_pb_;  // Filled by Build().
+  std::unique_ptr<Tensor> tensor_;      // Owned tensor being rendered.
+  string formatted_str_;                // CLI string, filled by Build().
+};
+}  // namespace tfprof
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_TFPROF_TENSOR_H_
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_tensor_test.cc b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_tensor_test.cc
new file mode 100644
index 00000000000..d3f1e3c7b70
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_tensor_test.cc
@@ -0,0 +1,306 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/c/checkpoint_reader.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_options.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_stats.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_utils.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/tfprof_log.pb.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/tfprof_output.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+namespace tfprof {
+// Test fixture: loads the testdata text-format GraphDef and the testdata
+// checkpoint, then builds the TFStats object the tests query.
+class TFProfTensorTest : public ::testing::Test {
+ protected:
+  TFProfTensorTest() {
+    string graph_path = io::JoinPath(
+        testing::TensorFlowSrcRoot(),
+        "contrib/tfprof/tools/tfprof/internal/testdata/graph.pbtxt");
+    std::unique_ptr<tensorflow::GraphDef> graph_pb(new tensorflow::GraphDef());
+    TF_CHECK_OK(ReadGraphDefText(Env::Default(), graph_path, graph_pb.get()));
+
+    // Intentionally left null: these tests exercise checkpointed tensor
+    // values only, so no RunMetadata or OpLog is supplied.
+    std::unique_ptr<tensorflow::RunMetadata> run_meta_pb;
+    std::unique_ptr<OpLog> op_log_pb;
+
+    string ckpt_path =
+        io::JoinPath(testing::TensorFlowSrcRoot(),
+                     "contrib/tfprof/tools/tfprof/internal/testdata/ckpt");
+    TF_Status* status = TF_NewStatus();
+    std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader(
+        new checkpoint::CheckpointReader(ckpt_path, status));
+    CHECK(TF_GetCode(status) == TF_OK);
+    TF_DeleteStatus(status);
+
+    tf_stats_.reset(new TFStats(std::move(graph_pb), std::move(run_meta_pb),
+                                std::move(op_log_pb), std::move(ckpt_reader)));
+  }
+
+  // Object under test; owns the graph and the checkpoint reader.
+  std::unique_ptr<TFStats> tf_stats_;
+};
+
+// Prints the scope tree restricted to Variable ops with "tensor_value"
+// selected, and compares the whole resulting TFProfNode (including every
+// checkpointed value, float up-cast to double) against a golden text proto.
+TEST_F(TFProfTensorTest, Basics) {
+  // max_depth=3, no min thresholds, account/show Variable ops everywhere,
+  // and request "tensor_value" so checkpointed values are materialized.
+  Options opts(3, 0, 0, 0, 0, {".*"}, "name", {"Variable"}, {".*"}, {""},
+               {".*"}, {""}, false, {"tensor_value"},  // show the tensor value.
+               false);
+  const TFProfNode& root = tf_stats_->PrintGraph("scope", opts);
+
+  // Golden output: root plus two Variable children "DW" and "DW2".
+  TFProfNode expected;
+  CHECK(protobuf::TextFormat::ParseFromString(
+      "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
+      "0\ntotal_exec_micros: 0\ntotal_requested_bytes: 0\ntotal_parameters: "
+      "450\nchildren {\n  name: \"DW\"\n  exec_micros: 0\n  requested_bytes: "
+      "0\n  parameters: 162\n  total_exec_micros: 0\n  total_requested_bytes: "
+      "0\n  total_parameters: 162\n  float_ops: 0\n  total_float_ops: 0\n  "
+      "tensor_value {\n    dtype: DT_FLOAT\n    value_double: -0.00117808\n    "
+      "value_double: -0.000709941\n    value_double: -0.00174816\n    "
+      "value_double: -0.000495372\n    value_double: 0.000243039\n    "
+      "value_double: -0.000126313\n    value_double: -0.000663929\n    "
+      "value_double: -0.000495198\n    value_double: -0.000893934\n    "
+      "value_double: -0.00179659\n    value_double: 0.000408874\n    "
+      "value_double: -0.00120166\n    value_double: -0.00109484\n    "
+      "value_double: -0.000200362\n    value_double: 0.000726721\n    "
+      "value_double: -0.000277568\n    value_double: 0.00180584\n    "
+      "value_double: 0.000997271\n    value_double: -0.00185987\n    "
+      "value_double: -0.00113401\n    value_double: -0.000528852\n    "
+      "value_double: -0.000197412\n    value_double: 1.32871e-05\n    "
+      "value_double: -0.000285896\n    value_double: -0.000428898\n    "
+      "value_double: -0.000424633\n    value_double: 2.15488e-05\n    "
+      "value_double: 0.00149753\n    value_double: -0.000884576\n    "
+      "value_double: -0.0013795\n    value_double: -0.000650125\n    "
+      "value_double: 0.00191612\n    value_double: 4.71838e-05\n    "
+      "value_double: 0.000400201\n    value_double: 0.00239555\n    "
+      "value_double: -0.00177706\n    value_double: -0.000781899\n    "
+      "value_double: -0.00145247\n    value_double: 0.0020025\n    "
+      "value_double: 0.000597419\n    value_double: 0.00135456\n    "
+      "value_double: 0.0015876\n    value_double: -0.000993568\n    "
+      "value_double: 0.0006509\n    value_double: -0.000894533\n    "
+      "value_double: -0.00129322\n    value_double: 0.0003859\n    "
+      "value_double: 0.000415186\n    value_double: -0.000439212\n    "
+      "value_double: 0.000442138\n    value_double: 0.00212353\n    "
+      "value_double: 0.000702953\n    value_double: 0.000713424\n    "
+      "value_double: -0.000304877\n    value_double: -9.17046e-05\n    "
+      "value_double: -0.000801103\n    value_double: 0.000304854\n    "
+      "value_double: -0.00070527\n    value_double: -0.00106408\n    "
+      "value_double: -0.000909906\n    value_double: -4.49183e-05\n    "
+      "value_double: 0.000104172\n    value_double: -0.000438067\n    "
+      "value_double: -0.000317689\n    value_double: -0.000769914\n    "
+      "value_double: -0.00157729\n    value_double: 0.000220733\n    "
+      "value_double: 0.00107268\n    value_double: -0.000186449\n    "
+      "value_double: -0.000807328\n    value_double: 0.000456308\n    "
+      "value_double: -0.000593729\n    value_double: -0.000954873\n    "
+      "value_double: -0.000268676\n    value_double: 9.06328e-05\n    "
+      "value_double: -0.000323473\n    value_double: -0.000628768\n    "
+      "value_double: 0.000664985\n    value_double: 0.0020999\n    "
+      "value_double: -0.000932228\n    value_double: -0.00203203\n    "
+      "value_double: 0.000565405\n    value_double: 0.000167899\n    "
+      "value_double: 0.00054897\n    value_double: 0.000612407\n    "
+      "value_double: -0.000619301\n    value_double: 0.00169361\n    "
+      "value_double: -0.000188057\n    value_double: 0.000267652\n    "
+      "value_double: -0.00127341\n    value_double: -0.000218836\n    "
+      "value_double: -0.000431722\n    value_double: 5.41867e-05\n    "
+      "value_double: 0.000296628\n    value_double: 0.000819415\n    "
+      "value_double: -0.000758993\n    value_double: -0.000114477\n    "
+      "value_double: 6.29219e-05\n    value_double: 0.000726988\n    "
+      "value_double: -0.00135974\n    value_double: 2.28447e-05\n    "
+      "value_double: 0.00120547\n    value_double: -0.00136907\n    "
+      "value_double: -0.00140188\n    value_double: 0.000201145\n    "
+      "value_double: -0.000774109\n    value_double: 0.000798465\n    "
+      "value_double: -0.00131861\n    value_double: 3.08996e-05\n    "
+      "value_double: -0.000637026\n    value_double: 0.00228975\n    "
+      "value_double: -0.000633757\n    value_double: -0.00116047\n    "
+      "value_double: 7.66039e-05\n    value_double: 2.09167e-06\n    "
+      "value_double: -0.000296448\n    value_double: 0.000206795\n    "
+      "value_double: 0.000674405\n    value_double: -0.000722742\n    "
+      "value_double: -9.32443e-05\n    value_double: -0.00170917\n    "
+      "value_double: -0.000505279\n    value_double: 0.000628132\n    "
+      "value_double: -0.00145929\n    value_double: 0.00106077\n    "
+      "value_double: -0.000796743\n    value_double: 0.000498275\n    "
+      "value_double: -0.0002914\n    value_double: -0.00230622\n    "
+      "value_double: -9.42872e-05\n    value_double: 0.000200359\n    "
+      "value_double: -0.00305027\n    value_double: -0.0016218\n    "
+      "value_double: 0.00137126\n    value_double: -0.00215436\n    "
+      "value_double: -0.000743827\n    value_double: -0.00090007\n    "
+      "value_double: -0.000762207\n    value_double: -0.000149951\n    "
+      "value_double: -0.0013102\n    value_double: 0.00165781\n    "
+      "value_double: 0.000343809\n    value_double: -0.000826069\n    "
+      "value_double: -4.67404e-05\n    value_double: 0.0023931\n    "
+      "value_double: 0.00165338\n    value_double: -0.00050529\n    "
+      "value_double: 0.000178771\n    value_double: -0.000858287\n    "
+      "value_double: -0.00157031\n    value_double: -0.00165846\n    "
+      "value_double: -0.000713672\n    value_double: 0.00014357\n    "
+      "value_double: 0.00203632\n    value_double: -0.0010973\n    "
+      "value_double: -9.89852e-05\n    value_double: 0.000558808\n    "
+      "value_double: 0.00087211\n    value_double: 0.000661239\n    "
+      "value_double: 0.000389605\n    value_double: 0.00060653\n    "
+      "value_double: -0.000330104\n  }\n}\nchildren {\n  name: \"DW2\"\n  "
+      "exec_micros: 0\n  requested_bytes: 0\n  parameters: 288\n  "
+      "total_exec_micros: 0\n  total_requested_bytes: 0\n  total_parameters: "
+      "288\n  float_ops: 0\n  total_float_ops: 0\n  tensor_value {\n    dtype: "
+      "DT_FLOAT\n    value_double: 0.000704577\n    value_double: "
+      "0.000127421\n    value_double: 0.00105952\n    value_double: "
+      "0.000423765\n    value_double: -0.00025461\n    value_double: "
+      "-0.000857203\n    value_double: 0.000693494\n    value_double: "
+      "0.000282214\n    value_double: 0.00106185\n    value_double: "
+      "-0.000836552\n    value_double: -0.00116766\n    value_double: "
+      "0.000733674\n    value_double: -0.000669601\n    value_double: "
+      "-0.000275175\n    value_double: -0.000428215\n    value_double: "
+      "-0.000495715\n    value_double: -0.000125887\n    value_double: "
+      "-0.000715204\n    value_double: -0.00108936\n    value_double: "
+      "0.000738267\n    value_double: 0.000376081\n    value_double: "
+      "0.00191442\n    value_double: 0.001423\n    value_double: -0.00093811\n "
+      "   value_double: -5.91421e-05\n    value_double: -0.000221507\n    "
+      "value_double: -0.000104555\n    value_double: -0.00069682\n    "
+      "value_double: -0.000278325\n    value_double: -0.00122748\n    "
+      "value_double: -0.00112411\n    value_double: -0.000440511\n    "
+      "value_double: -0.000392247\n    value_double: -0.000419606\n    "
+      "value_double: -0.00167063\n    value_double: -0.000988578\n    "
+      "value_double: -0.00040159\n    value_double: 0.00238918\n    "
+      "value_double: -0.000892898\n    value_double: -0.000875976\n    "
+      "value_double: 0.00154401\n    value_double: -0.000719911\n    "
+      "value_double: 0.000753941\n    value_double: -0.000119961\n    "
+      "value_double: -0.000305115\n    value_double: 9.97947e-05\n    "
+      "value_double: -0.00128908\n    value_double: -0.000584184\n    "
+      "value_double: -0.000734685\n    value_double: -0.00146612\n    "
+      "value_double: 0.000670802\n    value_double: 0.000924219\n    "
+      "value_double: -0.000154409\n    value_double: 0.000198231\n    "
+      "value_double: -0.000340742\n    value_double: -0.00159646\n    "
+      "value_double: -1.19382e-05\n    value_double: 0.00165203\n    "
+      "value_double: 0.0017085\n    value_double: -0.000199614\n    "
+      "value_double: 0.000529526\n    value_double: 0.000769364\n    "
+      "value_double: 0.00135369\n    value_double: 0.00132873\n    "
+      "value_double: 0.000451174\n    value_double: 0.000255218\n    "
+      "value_double: 0.00102891\n    value_double: -0.00160068\n    "
+      "value_double: 0.000324269\n    value_double: -0.000492347\n    "
+      "value_double: 0.000925301\n    value_double: 0.00281998\n    "
+      "value_double: -0.000826404\n    value_double: -0.000602903\n    "
+      "value_double: 0.00126559\n    value_double: 0.000924364\n    "
+      "value_double: -9.19827e-05\n    value_double: -5.59275e-05\n    "
+      "value_double: 0.00107971\n    value_double: -9.91756e-05\n    "
+      "value_double: 0.000864708\n    value_double: 0.00121747\n    "
+      "value_double: 0.00146338\n    value_double: 0.000186883\n    "
+      "value_double: -0.00168195\n    value_double: -0.00062029\n    "
+      "value_double: 0.000658127\n    value_double: 0.00115682\n    "
+      "value_double: -0.00178359\n    value_double: 0.000685606\n    "
+      "value_double: -0.000503373\n    value_double: -0.000312999\n    "
+      "value_double: 0.000335383\n    value_double: -1.08597e-05\n    "
+      "value_double: -8.2499e-05\n    value_double: -0.000469726\n    "
+      "value_double: -0.00170868\n    value_double: 0.000118957\n    "
+      "value_double: -0.000460736\n    value_double: -5.56372e-05\n    "
+      "value_double: -0.00110148\n    value_double: 0.00059123\n    "
+      "value_double: 0.000386339\n    value_double: -0.00139967\n    "
+      "value_double: -0.000835664\n    value_double: 0.00103421\n    "
+      "value_double: -0.00104296\n    value_double: -0.000687497\n    "
+      "value_double: 1.1338e-05\n    value_double: 0.00176484\n    "
+      "value_double: 0.000531523\n    value_double: -0.000986387\n    "
+      "value_double: -0.00114152\n    value_double: 0.000256744\n    "
+      "value_double: 0.000228425\n    value_double: 0.00116583\n    "
+      "value_double: 0.0002726\n    value_double: -0.00100828\n    "
+      "value_double: -0.000950376\n    value_double: -0.00229074\n    "
+      "value_double: -0.000348272\n    value_double: -0.000526032\n    "
+      "value_double: -0.000133703\n    value_double: 0.000310979\n    "
+      "value_double: -0.00199278\n    value_double: -0.000874469\n    "
+      "value_double: -0.000631466\n    value_double: 0.0010534\n    "
+      "value_double: 0.00134646\n    value_double: -0.00172743\n    "
+      "value_double: 0.00131031\n    value_double: -0.000697506\n    "
+      "value_double: 0.000286747\n    value_double: 0.000140759\n    "
+      "value_double: 0.000568707\n    value_double: 0.000108177\n    "
+      "value_double: -0.00207337\n    value_double: -0.00138146\n    "
+      "value_double: 0.000483162\n    value_double: -0.00167096\n    "
+      "value_double: -0.000465813\n    value_double: 0.00067724\n    "
+      "value_double: 2.08388e-05\n    value_double: -0.00203279\n    "
+      "value_double: 7.8429e-05\n    value_double: 0.00161337\n    "
+      "value_double: -0.000269005\n    value_double: 0.000217822\n    "
+      "value_double: 0.000599886\n    value_double: 0.000317549\n    "
+      "value_double: 0.00146597\n    value_double: -0.00210947\n    "
+      "value_double: -0.000823917\n    value_double: -6.83766e-05\n    "
+      "value_double: 0.000656085\n    value_double: 0.000117134\n    "
+      "value_double: -0.000390405\n    value_double: 2.39565e-05\n    "
+      "value_double: 0.00104837\n    value_double: -0.000563671\n    "
+      "value_double: 0.000634073\n    value_double: -0.000554531\n    "
+      "value_double: 0.000677971\n    value_double: -0.000596207\n    "
+      "value_double: -0.00103335\n    value_double: 0.000645199\n    "
+      "value_double: 0.00162195\n    value_double: 0.000239246\n    "
+      "value_double: 0.00113519\n    value_double: 0.000787431\n    "
+      "value_double: -0.000471688\n    value_double: -0.000216625\n    "
+      "value_double: -0.000537156\n    value_double: 0.000551816\n    "
+      "value_double: 0.00094337\n    value_double: -0.000708127\n    "
+      "value_double: 0.000956955\n    value_double: -0.000904936\n    "
+      "value_double: -0.000424413\n    value_double: 0.000106455\n    "
+      "value_double: -0.000443952\n    value_double: 0.000185436\n    "
+      "value_double: 0.000944397\n    value_double: -0.000760572\n    "
+      "value_double: 0.000560002\n    value_double: 4.09886e-05\n    "
+      "value_double: -0.00075076\n    value_double: -0.000701856\n    "
+      "value_double: -0.000234851\n    value_double: -0.000131515\n    "
+      "value_double: -0.000761718\n    value_double: -0.000267808\n    "
+      "value_double: -0.00039682\n    value_double: 0.000542953\n    "
+      "value_double: -0.000817685\n    value_double: 0.00103851\n    "
+      "value_double: -0.000427176\n    value_double: 0.000517784\n    "
+      "value_double: -0.000823552\n    value_double: -0.000742637\n    "
+      "value_double: 0.000529213\n    value_double: -0.000372805\n    "
+      "value_double: 1.85745e-05\n    value_double: 0.00139891\n    "
+      "value_double: -0.000128417\n    value_double: -0.000404316\n    "
+      "value_double: -0.000671571\n    value_double: 0.000490311\n    "
+      "value_double: -0.00118493\n    value_double: -0.000897118\n    "
+      "value_double: 0.000939601\n    value_double: 0.000376399\n    "
+      "value_double: 0.0014709\n    value_double: 0.000134806\n    "
+      "value_double: -0.000294469\n    value_double: -0.000569142\n    "
+      "value_double: 0.00127266\n    value_double: -0.00140936\n    "
+      "value_double: 0.000870083\n    value_double: 0.000287246\n    "
+      "value_double: 0.000537685\n    value_double: 0.000125569\n    "
+      "value_double: 0.000360276\n    value_double: -0.000186268\n    "
+      "value_double: 0.0011141\n    value_double: -0.000605185\n    "
+      "value_double: -0.0016281\n    value_double: -0.000552758\n    "
+      "value_double: -0.000196755\n    value_double: -0.00265188\n    "
+      "value_double: 0.000480997\n    value_double: 0.00018776\n    "
+      "value_double: -0.00199234\n    value_double: 0.000959982\n    "
+      "value_double: 0.00040334\n    value_double: -0.000693596\n    "
+      "value_double: 0.00157678\n    value_double: -0.00134499\n    "
+      "value_double: 0.00121909\n    value_double: -0.000328734\n    "
+      "value_double: 0.000148554\n    value_double: -0.000209509\n    "
+      "value_double: -0.000266303\n    value_double: -0.00134084\n    "
+      "value_double: 5.21371e-05\n    value_double: 0.0005329\n    "
+      "value_double: -0.000168858\n    value_double: -0.00074875\n    "
+      "value_double: 0.000959397\n    value_double: -0.00159476\n    "
+      "value_double: -0.000368838\n    value_double: 0.0006077\n    "
+      "value_double: -0.00117243\n    value_double: -0.00146013\n    "
+      "value_double: 0.00031519\n    value_double: -0.000167911\n    "
+      "value_double: 0.000482571\n    value_double: -0.000752268\n    "
+      "value_double: -0.00042363\n    value_double: 0.00121219\n    "
+      "value_double: -0.000208159\n    value_double: 0.000128531\n    "
+      "value_double: -0.000406308\n    value_double: -0.000242663\n    "
+      "value_double: -3.96673e-05\n    value_double: 0.00144854\n    "
+      "value_double: -0.000787328\n    value_double: -0.000401958\n    "
+      "value_double: 0.00114091\n    value_double: -0.000739546\n    "
+      "value_double: 0.000483236\n    value_double: -0.000916945\n    "
+      "value_double: -0.00129577\n    value_double: -0.00186504\n    "
+      "value_double: 0.000806804\n    value_double: -0.000152251\n    "
+      "value_double: 0.000662576\n    value_double: -0.000533236\n    "
+      "value_double: 0.00151019\n    value_double: 0.00127805\n    "
+      "value_double: 0.00115399\n    value_double: -0.00130876\n    "
+      "value_double: 2.99457e-06\n    value_double: 0.000820777\n    "
+      "value_double: 0.000878393\n    value_double: -0.000562642\n    "
+      "value_double: -0.00070442\n    value_double: -0.00066277\n  "
+      "}\n}\nfloat_ops: 0\ntotal_float_ops: 0\n",
+      &expected));
+  // Compare via DebugString so a mismatch prints the full readable diff.
+  EXPECT_EQ(expected.DebugString(), root.DebugString());
+}
+
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_utils.cc b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_utils.cc
new file mode 100644
index 00000000000..7610729a118
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_utils.cc
@@ -0,0 +1,350 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_utils.h"
+
+#include <stdio.h>
+#include <algorithm>
+#include <memory>
+#include <set>
+
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/regexp.h"
+
+namespace tensorflow {
+namespace tfprof {
+// Formats 'n' for display with decimal suffixes: the raw number below 1000,
+// otherwise two decimals with "k" (thousand), "m" (million) or "b" (billion).
+string FormatNumber(int64 n) {
+  if (n < 1000) {
+    return strings::Printf("%lld", n);
+  } else if (n < 1000000) {
+    return strings::Printf("%.2fk", n / 1000.0);
+  } else if (n < 1000000000) {
+    return strings::Printf("%.2fm", n / 1000000.0);
+  } else {
+    return strings::Printf("%.2fb", n / 1000000000.0);
+  }
+}
+
+// Formats a duration given in microseconds as "NNNus", "N.NNms" or "N.NNsec".
+string FormatTime(int64 micros) {
+  if (micros < 1000) {
+    return strings::Printf("%lldus", micros);
+  } else if (micros < 1000000) {
+    return strings::Printf("%.2fms", micros / 1000.0);
+  } else {
+    return strings::Printf("%.2fsec", micros / 1000000.0);
+  }
+}
+
+// Formats a byte count as "NNNB", "N.NNKB" or "N.NNMB" (decimal units).
+// Note there is no GB branch: anything >= 1MB is shown in MB.
+string FormatMemory(int64 bytes) {
+  if (bytes < 1000) {
+    return strings::Printf("%lldB", bytes);
+  } else if (bytes < 1000000) {
+    return strings::Printf("%.2fKB", bytes / 1000.0);
+  } else {
+    return strings::Printf("%.2fMB", bytes / 1000000.0);
+  }
+}
+
+// Renders a shape as its dimensions joined by 'x', e.g. {3, 3, 2} -> "3x3x2".
+// An empty (scalar) shape yields the empty string.
+string FormatShapes(const std::vector<int64>& shape) {
+  return str_util::Join(shape, "x");
+}
+
+// Replaces all matches of 'oldsub' in 'str' with 'newsub'. NOTE: despite the
+// name, 'oldsub' is interpreted as an RE2 regular expression (and 'newsub'
+// may reference capture groups), not as a literal substring.
+string StringReplace(const string& str, const string& oldsub,
+                     const string& newsub) {
+  string out = str;
+  RE2::GlobalReplace(&out, oldsub, newsub);
+  return out;
+}
+
+// Reads the file at 'fname' and parses it as a text-format GraphDef proto.
+// Propagates the read error on I/O failure; returns InvalidArgument when the
+// contents do not parse.
+Status ReadGraphDefText(Env* env, const string& fname, GraphDef* graph_def) {
+  string out;
+  Status s = ReadFileToString(env, fname, &out);
+  if (!s.ok()) return s;
+  if (protobuf::TextFormat::ParseFromString(out, graph_def)) {
+    return Status();
+  }
+  return errors::InvalidArgument("Cannot parse proto string.");
+}
+
+namespace {
+// Returns 's' with leading and trailing single/double quote characters
+// removed (inner quotes are kept). Returns "" when 's' consists entirely of
+// quote characters.
+string StripQuote(const string& s) {
+  int start = s.find_first_not_of("\"\'");
+  int end = s.find_last_not_of("\"\'");
+  if (start == s.npos || end == s.npos) return "";
+
+  return s.substr(start, end - start + 1);
+}
+
+// Builds an INVALID_ARGUMENT status naming the flag at pieces[idx] and, when
+// one is present, its value at pieces[idx + 1].
+// NOTE(review): 'pieces' is taken by value; a const reference would avoid
+// copying the whole vector on every error path.
+tensorflow::Status ReturnError(const std::vector<string> pieces, int idx) {
+  string val;
+  if (pieces.size() > idx + 1) {
+    val = pieces[idx + 1];
+  }
+  return tensorflow::Status(
+      tensorflow::error::INVALID_ARGUMENT,
+      strings::StrCat("Invalid option '", pieces[idx], "' value: '", val, "'"));
+}
+
+// Case-insensitive equality of two string pieces.
+bool CaseEqual(StringPiece s1, StringPiece s2) {
+  if (s1.size() != s2.size()) return false;
+  return str_util::Lowercase(s1) == str_util::Lowercase(s2);
+}
+
+// Parses common boolean spellings, case-insensitively: "true"/"t"/"yes"/"y"/
+// "1" set *value to true; "false"/"f"/"no"/"n"/"0" set it to false. Returns
+// false (leaving *value untouched) for any other input.
+bool StringToBool(StringPiece str, bool* value) {
+  CHECK(value != NULL) << "NULL output boolean given.";
+  if (CaseEqual(str, "true") || CaseEqual(str, "t") || CaseEqual(str, "yes") ||
+      CaseEqual(str, "y") || CaseEqual(str, "1")) {
+    *value = true;
+    return true;
+  }
+  if (CaseEqual(str, "false") || CaseEqual(str, "f") || CaseEqual(str, "no") ||
+      CaseEqual(str, "n") || CaseEqual(str, "0")) {
+    *value = false;
+    return true;
+  }
+  return false;
+}
+}  // namespace
+
+tensorflow::Status ParseCmdLine(const string& line, string* cmd,
+                                tensorflow::tfprof::Options* opts) {
+  std::vector<string> pieces =
+      str_util::Split(line, ' ', str_util::SkipEmpty());
+
+  std::vector<string> cmds_str(kCmds, kCmds + sizeof(kCmds) / sizeof(*kCmds));
+  if (std::find(cmds_str.begin(), cmds_str.end(), pieces[0]) ==
+      cmds_str.end()) {
+    return tensorflow::Status(tensorflow::error::INVALID_ARGUMENT,
+                              "First string must be a valid command.");
+  }
+  *cmd = pieces[0];
+
+  for (int i = 1; i < pieces.size(); ++i) {
+    if (pieces[i] == string(tensorflow::tfprof::kOptions[0])) {
+      if (pieces.size() <= i + 1 ||
+          !strings::safe_strto32(pieces[i + 1], &opts->max_depth)) {
+        return ReturnError(pieces, i);
+      }
+      ++i;
+    } else if (pieces[i] == tensorflow::tfprof::kOptions[1]) {
+      if (pieces.size() <= i + 1 ||
+          !strings::safe_strto64(pieces[i + 1], &opts->min_bytes)) {
+        return ReturnError(pieces, i);
+      }
+      ++i;
+    } else if (pieces[i] == tensorflow::tfprof::kOptions[2]) {
+      if (pieces.size() <= i + 1 ||
+          !strings::safe_strto64(pieces[i + 1], &opts->min_micros)) {
+        return ReturnError(pieces, i);
+      }
+      ++i;
+    } else if (pieces[i] == tensorflow::tfprof::kOptions[3]) {
+      if (pieces.size() <= i + 1 ||
+          !strings::safe_strto64(pieces[i + 1], &opts->min_params)) {
+        return ReturnError(pieces, i);
+      }
+      ++i;
+    } else if (pieces[i] == tensorflow::tfprof::kOptions[4]) {
+      if (pieces.size() <= i + 1 ||
+          !strings::safe_strto64(pieces[i + 1], &opts->min_float_ops)) {
+        return ReturnError(pieces, i);
+      }
+      ++i;
+    } else if (pieces[i] == tensorflow::tfprof::kOptions[5]) {
+      if (pieces.size() <= i + 1) {
+        return ReturnError(pieces, i);
+      }
+      opts->device_regexes = str_util::Split(StripQuote(pieces[i + 1]), ',',
+                                             str_util::SkipEmpty());
+      ++i;
+    } else if (pieces[i] == tensorflow::tfprof::kOptions[6]) {
+      if (pieces.size() <= i + 1) {
+        return ReturnError(pieces, i);
+      }
+      std::set<string> order_by_set(
+          kOrderBy, kOrderBy + sizeof(kOrderBy) / sizeof(*kOrderBy));
+      auto order_by = order_by_set.find(pieces[i + 1]);
+      if (order_by == order_by_set.end()) {
+        return ReturnError(pieces, i);
+      }
+      opts->order_by = *order_by;
+      ++i;
+    } else if (pieces[i] == tensorflow::tfprof::kOptions[7]) {
+      if (pieces.size() <= i + 1) {
+        return ReturnError(pieces, i);
+      }
+      opts->account_type_regexes = str_util::Split(StripQuote(pieces[i + 1]),
+                                                   ',', str_util::SkipEmpty());
+      ++i;
+    } else if (pieces[i] == tensorflow::tfprof::kOptions[8]) {
+      if (pieces.size() <= i + 1) {
+        return ReturnError(pieces, i);
+      }
+      opts->start_name_regexes = str_util::Split(StripQuote(pieces[i + 1]), ',',
+                                                 str_util::SkipEmpty());
+      ++i;
+    } else if (pieces[i] == tensorflow::tfprof::kOptions[9]) {
+      if (pieces.size() <= i + 1) {
+        return ReturnError(pieces, i);
+      }
+      opts->trim_name_regexes = str_util::Split(StripQuote(pieces[i + 1]), ',',
+                                                str_util::SkipEmpty());
+      ++i;
+    } else if (pieces[i] == tensorflow::tfprof::kOptions[10]) {
+      if (pieces.size() <= i + 1) {
+        return ReturnError(pieces, i);
+      }
+      opts->show_name_regexes = str_util::Split(StripQuote(pieces[i + 1]), ',',
+                                                str_util::SkipEmpty());
+      ++i;
+    } else if (pieces[i] == tensorflow::tfprof::kOptions[11]) {
+      if (pieces.size() <= i + 1) {
+        return ReturnError(pieces, i);
+      }
+      opts->hide_name_regexes = str_util::Split(StripQuote(pieces[i + 1]), ',',
+                                                str_util::SkipEmpty());
+      ++i;
+    } else if (pieces[i] == tensorflow::tfprof::kOptions[12]) {
+      if ((pieces.size() > i + 1 && pieces[i + 1].find("-") == 0) ||
+          pieces.size() == i + 1) {
+        opts->account_displayed_op_only = true;
+      } else if (!StringToBool(pieces[i + 1],
+                               &opts->account_displayed_op_only)) {
+        return ReturnError(pieces, i);
+      } else {
+        ++i;
+      }
+    } else if (pieces[i] == tensorflow::tfprof::kOptions[13]) {
+      if (pieces.size() <= i + 1) {
+        return ReturnError(pieces, i);
+      }
+      std::set<string> shown_set(kShown,
+                                 kShown + sizeof(kShown) / sizeof(*kShown));
+      std::vector<string> requested_vector = str_util::Split(
+          StripQuote(pieces[i + 1]), ',', str_util::SkipEmpty());
+      std::set<string> requested_set(requested_vector.begin(),
+                                     requested_vector.end());
+      for (const string& requested : requested_set) {
+        if (shown_set.find(requested) == shown_set.end()) {
+          return ReturnError(pieces, i);
+        }
+      }
+      opts->select = requested_set;
+      ++i;
+    } else if (pieces[i] == tensorflow::tfprof::kOptions[14]) {
+      if ((pieces.size() > i + 1 && pieces[i + 1].find("-") == 0) ||
+          pieces.size() == i + 1) {
+        opts->viz = true;
+      } else if (!StringToBool(pieces[i + 1], &opts->viz)) {
+        return ReturnError(pieces, i);
+      } else {
+        ++i;
+      }
+    } else if (pieces[i] == tensorflow::tfprof::kOptions[15]) {
+      if (pieces.size() <= i + 1) {
+        return ReturnError(pieces, i);
+      }
+      opts->dump_to_file = StripQuote(pieces[i + 1]);
+      ++i;
+    } else {
+      return ReturnError(pieces, i);
+    }
+  }
+  return tensorflow::Status::OK();
+}
+
+void PrintHelp() {
+  printf(
+      "\nSee go/tfprof for a detailed tutorial.\n"
+      "\nCommands\n\n"
+      "  scope: Each op has its op name in TensorFlow, such as 'n1', 'n1/n2', "
+      "'n1/n2/n3'. 'n1/n2' is a child of 'n1'. 'scope' command builds "
+      "a name scope tree and aggregates statistics based on it.\n\n"
+      "  graph: ops in TensorFlow are organized as a graph based on their "
+      "source (inputs) and sink (outputs). 'graph' command builds "
+      "a graph pointing *from output to input*, and aggregates "
+      "statistics based on it.\n\n"
+      "  set: Set options that will be default for follow up commands.\n\n"
+      "  help: Show help.\n"
+      "\nOptions\n\n"
+      "Press Enter in CLI to see default option values.\n\n"
+      "  -max_depth: Show ops that are at most this number of hops from "
+      "starting op in the tree/graph structure.\n\n"
+      "  -min_bytes: Show ops that request at least this number of bytes.\n\n"
+      "  -min_micros: Show ops that spend at least this number of micros to "
+      "run.\n\n"
+      "  -min_params: Show ops that contain at least this number of "
+      "parameters.\n\n"
+      "  -min_float_ops: Show ops that contain at least this number of "
+      "float operations. Only available if an op has "
+      "op.RegisterStatistics() defined and OpLog is "
+      "provided\n\n"
+      "  -device_regexes: Show ops that are placed on the specified devices. "
+      "regexes are comma-separated.\n\n"
+      "  -order_by: Order the results by [name|depth|bytes|micros|params|"
+      "float_ops]\n\n"
+      "  -account_type_regexes: Account and display the ops whose types match "
+      "one of the type regexes specified. tfprof "
+      "allow user to define extra op types for ops "
+      "through tensorflow.tfprof.OpLog proto. regexes "
+      "are comma-separated.\n\n"
+      "  -start_name_regexes: Show ops starting from the ops that match the "
+      "regexes, recursively. regexes are "
+      "comma-separated.\n\n"
+      "  -trim_name_regexes: Hide ops starting from the ops that match the "
+      "regexes, recursively. regexes are comma-separated. "
+      "\n\n"
+      "  -show_name_regexes: Show ops that match the regexes. regexes are "
+      "comma-separated.\n\n"
+      "  -hide_name_regexes: Hide ops that match the regexes. regexes are "
+      "comma-separated.\n\n"
+      ""
+      "  Notes: For each op, -account_type_regexes is first evaluated, "
+      "only ops with types matching the specified regexes are accounted and "
+      "selected for display. -start/trim/show/hide_name_regexes are used "
+      "to further filter ops for display. -start_name_regexes is evaluated "
+      "first to search the starting ops to display. Descendants of starting "
+      "ops are then evaluated against show/hide_name_regexes to make display "
+      "decision. If an op matches trim_name_regexes, all its descendants are "
+      "hidden.\n"
+      "Ops statistics are *accounted even if they are hidden* as long as "
+      "they match the -account_xxx options.\n\n"
+      "  -account_displayed_op_only: If True, only account the statistics of "
+      "ops eventually displayed. If False, account all "
+      "op statistics matching -account_type_regexes recursively.\n\n"
+      "  -select: Comma-separated list of metrics to show: [bytes|micros|"
+      "params|float_ops|num_hidden_ops|tensor_value|device|op_types]."
+      "\n\n"
+      "  -dump_to_file: Dump the output to a file, instead of terminal.\n\n"
+      ""
+      "Examples\n"
+      "  Assuming a toy model:\n"
+      "    input(typeB)->conv2d_1(typeA)->conv2d_2(typeA)->"
+      "fc(typeA)->cost(typeA)->summarize(typeC)\n"
+      "  Command:\n"
+      "    tfprof> graph -account_type_regexes typeA -start_name_regexes "
+      "cost.* -show_name_regexes conv2d.* -max_depth 10\n\n"
+      "  The above command only aggregates statistics of all ops of typeA ("
+      "hence ignoring input(typeB)). It will start looking for candidates to "
+      "display from cost.* and finally displays conv2d_1 and conv2d_2.\n\n");
+  fflush(stdout);
+}
+
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_utils.h b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_utils.h
new file mode 100644
index 00000000000..6c1bba04fc2
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_utils.h
@@ -0,0 +1,50 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_TFPROF_UTILS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_TFPROF_UTILS_H_
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_options.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/env.h"
+
+namespace tensorflow {
+namespace tfprof {
+string FormatNumber(int64 n);
+
+string FormatTime(int64 micros);
+
+string FormatMemory(int64 bytes);
+
+string FormatShapes(const std::vector<int64>& shapes);
+
+tensorflow::Status ParseCmdLine(const string& line, string* cmd,
+                                tensorflow::tfprof::Options* opts);
+
+string StringReplace(const string& str, const string& oldsub,
+                     const string& newsub);
+
+Status ReadGraphDefText(Env* env, const string& fname, GraphDef* graph_def);
+
+void PrintHelp();
+
+}  // namespace tfprof
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TFPROF_TOOLS_TFPROF_INTERNAL_TFPROF_UTILS_H_
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/tfprof_log.proto b/tensorflow/contrib/tfprof/tools/tfprof/tfprof_log.proto
new file mode 100644
index 00000000000..cae6e1e3a8c
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/tfprof_log.proto
@@ -0,0 +1,19 @@
+syntax = "proto2";
+
+package tensorflow.tfprof;
+
+message OpLogEntry {
+  // op name.
+  optional string name = 1;
+  // float_ops is filled by tfprof Python API when called. It requires the
+  // op has RegisterStatistics defined. Currently, Conv2D, MatMul, etc, are
+  // implemented.
+  optional int64 float_ops = 2;
+  // User can define extra op type information for an op. This allows the user
+  // to select a group of ops precisely using op_type as a key.
+  repeated string types = 3;
+}
+
+message OpLog {
+  repeated OpLogEntry log_entries = 1;
+}
\ No newline at end of file
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/tfprof_main.cc b/tensorflow/contrib/tfprof/tools/tfprof/tfprof_main.cc
new file mode 100644
index 00000000000..d9080242d6b
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/tfprof_main.cc
@@ -0,0 +1,236 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "linenoise.h"
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/c/checkpoint_reader.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_options.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_stats.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/internal/tfprof_utils.h"
+#include "tensorflow/contrib/tfprof/tools/tfprof/tfprof_log.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+using tensorflow::str_util::Split;
+
+void completion(const char* buf, linenoiseCompletions* lc) {
+  tensorflow::string buf_str = tensorflow::string(buf);
+  if (buf_str.find(" ") == buf_str.npos) {
+    for (const char* opt : tensorflow::tfprof::kCmds) {
+      if (tensorflow::string(opt).find(buf_str) == 0) {
+        linenoiseAddCompletion(lc, opt);
+      }
+    }
+    return;
+  }
+
+  tensorflow::string prefix;
+  int last_dash = buf_str.find_last_of(' ');
+  if (last_dash != tensorflow::string::npos) {
+    prefix = buf_str.substr(0, last_dash + 1);
+    buf_str = buf_str.substr(last_dash + 1, tensorflow::kint32max);
+  }
+  for (const char* opt : tensorflow::tfprof::kOptions) {
+    if (tensorflow::string(opt).find(buf_str) == 0) {
+      linenoiseAddCompletion(lc, (prefix + opt).c_str());
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  tensorflow::string FLAGS_graph_path = "";
+  tensorflow::string FLAGS_run_meta_path = "";
+  tensorflow::string FLAGS_op_log_path = "";
+  tensorflow::string FLAGS_checkpoint_path = "";
+  tensorflow::int32 FLAGS_max_depth = 4;
+  tensorflow::int64 FLAGS_min_bytes = 0;
+  tensorflow::int64 FLAGS_min_micros = 0;
+  tensorflow::int64 FLAGS_min_params = 0;
+  tensorflow::int64 FLAGS_min_float_ops = 0;
+  tensorflow::string FLAGS_device_regexes = ".*";
+  tensorflow::string FLAGS_order_by = "name";
+  tensorflow::string FLAGS_account_type_regexes = "Variable";
+  tensorflow::string FLAGS_start_name_regexes = ".*";
+  tensorflow::string FLAGS_trim_name_regexes = "";
+  tensorflow::string FLAGS_show_name_regexes = ".*";
+  tensorflow::string FLAGS_hide_name_regexes;
+  bool FLAGS_account_displayed_op_only = false;
+  tensorflow::string FLAGS_select = "params";
+  bool FLAGS_viz = false;
+  tensorflow::string FLAGS_dump_to_file = "";
+  for (int i = 0; i < argc; i++) {
+    fprintf(stderr, "%s\n", argv[i]);
+  }
+
+  CHECK(tensorflow::ParseFlags(
+      &argc, argv,
+      {tensorflow::Flag("graph_path", &FLAGS_graph_path),
+       tensorflow::Flag("run_meta_path", &FLAGS_run_meta_path),
+       tensorflow::Flag("op_log_path", &FLAGS_op_log_path),
+       tensorflow::Flag("checkpoint_path", &FLAGS_checkpoint_path),
+       tensorflow::Flag("max_depth", &FLAGS_max_depth),
+       tensorflow::Flag("min_bytes", &FLAGS_min_bytes),
+       tensorflow::Flag("min_micros", &FLAGS_min_micros),
+       tensorflow::Flag("min_params", &FLAGS_min_params),
+       tensorflow::Flag("min_float_ops", &FLAGS_min_float_ops),
+       tensorflow::Flag("device_regexes", &FLAGS_device_regexes),
+       tensorflow::Flag("order_by", &FLAGS_order_by),
+       tensorflow::Flag("account_type_regexes", &FLAGS_account_type_regexes),
+       tensorflow::Flag("trim_name_regexes", &FLAGS_trim_name_regexes),
+       tensorflow::Flag("show_name_regexes", &FLAGS_show_name_regexes),
+       tensorflow::Flag("hide_name_regexes", &FLAGS_hide_name_regexes),
+       tensorflow::Flag("account_displayed_op_only",
+                        &FLAGS_account_displayed_op_only),
+       tensorflow::Flag("select", &FLAGS_select),
+       tensorflow::Flag("dump_to_file", &FLAGS_dump_to_file)}));
+  tensorflow::port::InitMain(argv[0], &argc, &argv);
+
+  fprintf(stderr, "%s\n", FLAGS_graph_path.c_str());
+
+  std::vector<tensorflow::string> device_regexes =
+      Split(FLAGS_device_regexes, ',', tensorflow::str_util::SkipEmpty());
+  std::vector<tensorflow::string> account_type_regexes =
+      Split(FLAGS_account_type_regexes, ',', tensorflow::str_util::SkipEmpty());
+  std::vector<tensorflow::string> start_name_regexes =
+      Split(FLAGS_start_name_regexes, ',', tensorflow::str_util::SkipEmpty());
+  std::vector<tensorflow::string> trim_name_regexes =
+      Split(FLAGS_trim_name_regexes, ',', tensorflow::str_util::SkipEmpty());
+  std::vector<tensorflow::string> show_name_regexes =
+      Split(FLAGS_show_name_regexes, ',', tensorflow::str_util::SkipEmpty());
+  std::vector<tensorflow::string> hide_name_regexes =
+      Split(FLAGS_hide_name_regexes, ',', tensorflow::str_util::SkipEmpty());
+  std::vector<tensorflow::string> select =
+      Split(FLAGS_select, ',', tensorflow::str_util::SkipEmpty());
+
+  tensorflow::string cmd = "";
+  if (argc == 1 && FLAGS_graph_path.empty()) {
+    printf("1) go/tfprof: Tutorial.\n");
+    printf("2) tfprof help: Detail help information.\n");
+    printf(
+        "3) tfprof --graph_path <GraphDef proto text file>: "
+        "Profiling model structure, tensor shape and # parameters.\n");
+    printf(
+        "4) tfprof --graph_path <GraphDef proto text file> \\\n"
+        "          --run_meta_path <RunMetadata proto binary file> \\\n"
+        "          --op_log_path <tensorflow::tfprof::OpLog proto binary file> "
+        "\\\n"
+        "          --checkpoint_path <TensorFlow Checkpoint file>: "
+        "Profiling everything!\n");
+    return 0;
+  } else if (argc > 1) {
+    if (tensorflow::string(argv[1]) == tensorflow::tfprof::kCmds[3]) {
+      tensorflow::tfprof::PrintHelp();
+      return 0;
+    }
+    if (tensorflow::string(argv[1]) == tensorflow::tfprof::kCmds[0] ||
+        tensorflow::string(argv[1]) == tensorflow::tfprof::kCmds[1]) {
+      cmd = argv[1];
+    }
+  }
+
+  printf("Reading Files...\n");
+  std::unique_ptr<tensorflow::GraphDef> graph(new tensorflow::GraphDef());
+  TF_CHECK_OK(tensorflow::tfprof::ReadGraphDefText(
+      tensorflow::Env::Default(), FLAGS_graph_path, graph.get()));
+
+  std::unique_ptr<tensorflow::RunMetadata> run_meta(
+      new tensorflow::RunMetadata());
+  if (!ReadBinaryProto(tensorflow::Env::Default(), FLAGS_run_meta_path,
+                       run_meta.get())
+           .ok()) {
+    run_meta.release();
+  }
+
+  std::unique_ptr<tensorflow::tfprof::OpLog> op_log(
+      new tensorflow::tfprof::OpLog());
+  if (!ReadBinaryProto(tensorflow::Env::Default(), FLAGS_op_log_path,
+                       op_log.get())
+           .ok()) {
+    op_log.release();
+  }
+
+  std::unique_ptr<tensorflow::checkpoint::CheckpointReader> ckpt_reader;
+  TF_Status* status = TF_NewStatus();
+  if (!FLAGS_checkpoint_path.empty()) {
+    ckpt_reader.reset(new tensorflow::checkpoint::CheckpointReader(
+        FLAGS_checkpoint_path, status));
+    if (TF_GetCode(status) != TF_OK) {
+      fprintf(stderr, "%s\n", TF_Message(status));
+      TF_DeleteStatus(status);
+      return 1;
+    }
+    TF_DeleteStatus(status);
+  }
+
+  tensorflow::tfprof::TFStats tf_stat(std::move(graph), std::move(run_meta),
+                                      std::move(op_log),
+                                      std::move(ckpt_reader));
+  tensorflow::tfprof::Options opts(
+      FLAGS_max_depth, FLAGS_min_bytes, FLAGS_min_micros, FLAGS_min_params,
+      FLAGS_min_float_ops, device_regexes, FLAGS_order_by, account_type_regexes,
+      start_name_regexes, trim_name_regexes, show_name_regexes,
+      hide_name_regexes, FLAGS_account_displayed_op_only, select, FLAGS_viz,
+      FLAGS_dump_to_file);
+
+  if (!cmd.empty()) {
+    tf_stat.PrintGraph(cmd, opts);
+    return 0;
+  }
+
+  linenoiseSetCompletionCallback(completion);
+  linenoiseHistoryLoad(".tfprof_history.txt");
+
+  for (char* line = nullptr; (line = linenoise("tfprof> ")) != nullptr;) {
+    tensorflow::string line_s = tensorflow::string(line);
+    free(line);
+
+    if (line_s.empty()) {
+      printf("%s", opts.ToString().c_str());
+      continue;
+    }
+    linenoiseHistoryAdd(line_s.c_str());
+    linenoiseHistorySave(".tfprof_history.txt");
+
+    tensorflow::tfprof::Options new_opts = opts;
+    tensorflow::Status s =
+        tensorflow::tfprof::ParseCmdLine(line_s, &cmd, &new_opts);
+    if (!s.ok()) {
+      fprintf(stderr, "E: %s\n", s.ToString().c_str());
+      continue;
+    }
+    if (cmd == tensorflow::tfprof::kCmds[2]) {
+      opts = new_opts;
+    } else if (cmd == tensorflow::tfprof::kCmds[3]) {
+      tensorflow::tfprof::PrintHelp();
+    } else {
+      tf_stat.PrintGraph(cmd, new_opts);
+    }
+  }
+  return 0;
+}
diff --git a/tensorflow/contrib/tfprof/tools/tfprof/tfprof_output.proto b/tensorflow/contrib/tfprof/tools/tfprof/tfprof_output.proto
new file mode 100644
index 00000000000..9afd41046e4
--- /dev/null
+++ b/tensorflow/contrib/tfprof/tools/tfprof/tfprof_output.proto
@@ -0,0 +1,49 @@
+syntax = "proto2";
+
+import "tensorflow/core/framework/tensor_shape.proto";
+import "tensorflow/core/framework/types.proto";
+
+package tensorflow.tfprof;
+
+message TFProfTensorProto {
+  optional DataType dtype = 1;
+  // Flatten tensor in row-major.
+  // Only one of the following array is set.
+  repeated double value_double = 2;
+  repeated int64 value_int64 = 3;
+  repeated string value_str = 4;
+}
+
+message TFProfNode {
+  // op name.
+  optional string name = 1;
+  // tensor value restored from checkpoint.
+  optional TFProfTensorProto tensor_value = 15;
+  // op execution time.
+  optional int64 exec_micros = 2;
+  // Total requested bytes by the op.
+  optional int64 requested_bytes = 3;
+  // Number of parameters if available.
+  optional int64 parameters = 4;
+  // Number of float operations.
+  optional int64 float_ops = 13;
+  // Number of inputs to the op.
+  optional int64 inputs = 5;
+  // Device the op is assigned to.
+  optional string device = 10;
+
+  // The following are the aggregated stats from all accounted descendants and
+  // the op itself. The actual descendants depend on the data structure used
+  // (scope, graph).
+  optional int64 total_exec_micros = 6;
+  optional int64 total_requested_bytes = 7;
+  optional int64 total_parameters = 8;
+  optional int64 total_float_ops = 14;
+  optional int64 total_inputs = 9;
+
+  // shape information, if available.
+  repeated TensorShapeProto shapes = 11;
+  // Descendants of the graph. The actual descendants depend on the data
+  // structure used (scope, graph).
+  repeated TFProfNode children = 12;
+}
\ No newline at end of file
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 6c1733ea472..3e0c58a7172 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -77,6 +77,7 @@ load(
     "tf_proto_library_cc",
     "tf_additional_lib_hdrs",
     "tf_additional_lib_srcs",
+    "tf_additional_proto_hdrs",
     "tf_additional_proto_srcs",
     "tf_additional_lib_deps",
     "tf_additional_stream_executor_srcs",
@@ -127,7 +128,7 @@ cc_library(
         "platform/platform.h",
         "platform/protobuf.h",
         "platform/types.h",
-    ],
+    ] + glob(tf_additional_proto_hdrs()),
     deps = [
         ":protos_all_cc",
         "//tensorflow/core/platform/default/build_config:proto_parsing",
@@ -155,7 +156,6 @@ cc_library(
         "lib/hash/crc32c.h",  # TODO(josh11b): make internal
         "lib/histogram/histogram.h",
         "lib/io/inputbuffer.h",  # TODO(josh11b): make internal
-        "lib/io/match.h",  # TODO(vrv,jeff): remove once Env->Match change is in
         "lib/io/path.h",
         "lib/io/proto_encode_helper.h",
         "lib/io/record_reader.h",
@@ -911,11 +911,15 @@ cc_library(
             "platform/profile_utils/**/*.h",
             "platform/profile_utils/**/*.cc",
         ] + tf_additional_lib_srcs(),
-        exclude = [
-            "**/*test*",
-            "platform/**/cuda.h",
-            "platform/**/stream_executor.h",
-        ],
+        exclude =
+            [
+                "**/*test*",
+                "platform/**/cuda.h",
+                "platform/**/stream_executor.h",
+            ] +
+            # Protobuf deps already included through the ":lib_proto_parsing"
+            # dependency.
+            tf_additional_proto_srcs(),
     ),
     hdrs = glob(tf_additional_lib_hdrs()) + [
         "lib/core/blocking_counter.h",
@@ -932,7 +936,6 @@ cc_library(
         "lib/io/buffered_inputstream.h",
         "lib/io/inputstream_interface.h",
         "lib/io/iterator.h",
-        "lib/io/match.h",
         "lib/io/random_inputstream.h",
         "lib/io/snappy/snappy_inputbuffer.h",
         "lib/io/snappy/snappy_outputbuffer.h",
@@ -960,6 +963,7 @@ cc_library(
     copts = tf_copts(),
     linkopts = ["-ldl"],
     deps = [
+        ":lib_proto_parsing",
         ":protos_all_cc",
         "//tensorflow/core/platform/default/build_config:platformlib",
         "//third_party/eigen3",
@@ -1384,7 +1388,6 @@ tf_cc_tests(
         "lib/io/buffered_inputstream_test.cc",
         "lib/io/inputbuffer_test.cc",
         "lib/io/inputstream_interface_test.cc",
-        "lib/io/match_test.cc",
         "lib/io/path_test.cc",
         "lib/io/random_inputstream_test.cc",
         "lib/io/record_reader_writer_test.cc",
@@ -1408,7 +1411,6 @@ tf_cc_tests(
         "lib/strings/strcat_test.cc",
         "lib/strings/stringprintf_test.cc",
         "lib/wav/wav_io_test.cc",
-        "platform/file_system_test.cc",
         "platform/fingerprint_test.cc",
         "platform/integral_types_test.cc",
         "platform/logging_test.cc",
@@ -1727,6 +1729,7 @@ tf_cc_test(
     name = "common_runtime_direct_session_with_tracking_alloc_test",
     size = "small",
     srcs = ["common_runtime/direct_session_with_tracking_alloc_test.cc"],
+    args = ["--heap_check=local"],  # The GPU tracer leaks memory
     linkstatic = tf_kernel_tests_linkstatic(),
     deps = [
         ":core",
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 0344bd9c978..6d7365882bc 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -415,18 +415,13 @@ Status DirectSession::Run(const RunOptions& run_options,
   const int64 build_cost_model =
       options_.config.graph_options().build_cost_model();
   if (do_trace || build_cost_model > 0) {
-    run_state.collector.reset(new StepStatsCollector(
-        run_metadata->mutable_step_stats(),
-        (build_cost_model > 0) ? &cost_model_manager_ : nullptr));
+    run_state.collector.reset(
+        new StepStatsCollector(run_metadata->mutable_step_stats()));
     args.stats_collector = run_state.collector.get();
   }
 
-  // TODO(pbar) CostModel still gets very confused when presented
-  // with trace data from the GPUTracer. This will need fixing if the
-  // cost model needs meaningful GPU timing information.
   std::unique_ptr<GPUTracer> tracer;
-  if (!build_cost_model &&
-      run_options.trace_level() >= RunOptions::HARDWARE_TRACE) {
+  if (run_options.trace_level() >= RunOptions::HARDWARE_TRACE) {
     tracer.reset(CreateGPUTracer());
     // tracer will be NULL on non-GPU platforms.
     if (tracer) tracer->Start();
@@ -462,6 +457,17 @@ Status DirectSession::Run(const RunOptions& run_options,
   mutex_lock l(executor_lock_);
   ++executors_and_keys->step_count;
   if (executors_and_keys->step_count == build_cost_model) {
+    // Build the cost model
+    std::unordered_map<string, const Graph*> device_to_graph;
+    for (const PerPartitionExecutorsAndLib& partition :
+         executors_and_keys->items) {
+      const Graph* graph = partition.graph;
+      const string device = partition.flib->device()->name();
+      device_to_graph[device] = graph;
+    }
+    args.stats_collector->BuildCostModel(&cost_model_manager_, device_to_graph);
+
+    // annotate stats onto cost graph.
     CostGraphDef* cost_graph = run_metadata->mutable_cost_graph();
     for (const auto& item : executors_and_keys->items) {
       TF_RETURN_IF_ERROR(
@@ -547,8 +553,7 @@ Status DirectSession::PRunSetup(const std::vector<string>& input_names,
   }
 
   if (options_.config.graph_options().build_cost_model()) {
-    run_state->collector.reset(
-        new StepStatsCollector(nullptr, &cost_model_manager_));
+    run_state->collector.reset(new StepStatsCollector(nullptr));
     args.stats_collector = run_state->collector.get();
   }
 
@@ -647,6 +652,7 @@ Status DirectSession::PRun(const string& handle, const NamedTensorList& inputs,
       partial_runs_.erase(handle);
     }
   }
+
   return s;
 }
 
diff --git a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
index 99100ed39c5..6f0f12496f3 100644
--- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
@@ -109,5 +109,87 @@ TEST(DirectSessionWithTrackingAllocTest, CostModelTest) {
   ASSERT_EQ(2, graph_cnt);
 }
 
+static void TestHWAccelerator(bool enableHWTrace) {
+  EnableCPUAllocatorFullStats(true);
+
+  Graph graph(OpRegistry::Global());
+
+  Tensor a_tensor(DT_FLOAT, TensorShape({2, 2}));
+  test::FillValues<float>(&a_tensor, {3, 2, -1, 0});
+  Node* a = test::graph::Constant(&graph, a_tensor);
+  a->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0");
+
+  Tensor x_tensor(DT_FLOAT, TensorShape({2, 1}));
+  test::FillValues<float>(&x_tensor, {1, 1});
+  Node* x = test::graph::Constant(&graph, x_tensor);
+  x->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0");
+
+  // y = A * x
+  Node* y = test::graph::Matmul(&graph, a, x, false, false);
+  y->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0");
+
+  Node* y_neg = test::graph::Unary(&graph, "Neg", y);
+  y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0");
+
+  GraphDef def;
+  test::graph::ToGraphDef(&graph, &def);
+
+  SessionOptions options;
+  (*options.config.mutable_device_count())["CPU"] = 1;
+  (*options.config.mutable_device_count())["GPU"] = 1;
+  options.config.set_allow_soft_placement(true);
+  options.config.mutable_graph_options()->set_build_cost_model(true);
+  std::unique_ptr<Session> session(NewSession(options));
+  TF_ASSERT_OK(session->Create(def));
+  std::vector<std::pair<string, Tensor>> inputs;
+
+  // Request two targets: one fetch output and one non-fetched output.
+  std::vector<string> output_names = {y->name() + ":0"};
+  std::vector<string> target_nodes = {y_neg->name()};
+  std::vector<Tensor> outputs;
+  const int64 start_micros = Env::Default()->NowMicros();
+
+  RunOptions run_options;
+  if (enableHWTrace) {
+    run_options.set_trace_level(RunOptions::FULL_TRACE);
+  }
+  RunMetadata run_metadata;
+  Status s = session->Run(run_options, inputs, output_names, target_nodes,
+                          &outputs, &run_metadata);
+  const int64 run_duration_micros = Env::Default()->NowMicros() - start_micros;
+  TF_ASSERT_OK(s);
+
+  DirectSession* ds = static_cast<DirectSession*>(session.get());
+  int graph_cnt = 0;
+  CostModelManager::CostModelMap cost_models;
+  ds->ExportCostModels(&cost_models);
+  for (auto& it : cost_models) {
+    const Graph* g = (it).first;
+    const CostModel* cm = (it).second;
+    for (Node* node : g->nodes()) {
+      if (node->name() == y->name()) {
+        EXPECT_LE(8, cm->MaxMemorySize(node, 0));
+      } else if (node->name() == y_neg->name()) {
+        EXPECT_LE(8, cm->MaxMemorySize(node, 0));
+      }
+      EXPECT_LE(0, cm->MaxExecutionTime(node));
+      EXPECT_GE(run_duration_micros, cm->MaxExecutionTime(node));
+    }
+    graph_cnt++;
+  }
+  // We should have 2 cost models since we requested 1 cpu and 1 gpu. However
+  // since the placement is soft, we might end up placing everything on cpu.
+  ASSERT_GE(2, graph_cnt);
+  ASSERT_LE(1, graph_cnt);
+}
+
+TEST(DirectSessionWithTrackingAllocTest, CostModelForAccelerator) {
+  TestHWAccelerator(false);
+}
+
+TEST(DirectSessionWithTrackingAllocTest, CostModelWithHardwareStats) {
+  TestHWAccelerator(true);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index 9261414b69e..3891658876c 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -1611,7 +1611,6 @@ bool ExecutorState::NodeDone(const Status& s, const Node* node,
                              TaggedNodeReadyQueue* inline_ready) {
   if (stats_collector_) {
     nodestats::SetAllEnd(stats);
-    stats_collector_->UpdateCostModelNode(stats, impl_->graph_, node);
     if (!SetTimelineLabel(node, stats)) {
       // Only record non-transfer nodes.
       stats_collector_->Save(impl_->params_.device->name(), stats);
diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.h b/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
index a5907638297..278a6b3f9f8 100644
--- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
+++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
@@ -54,7 +54,6 @@ class Benchmark {
 
  private:
   thread::ThreadPool* pool_ = nullptr;
-  thread::ThreadPool* non_blocking_pool_ = nullptr;
   Device* device_ = nullptr;
   Rendezvous* rendez_ = nullptr;
   Executor* exec_ = nullptr;
diff --git a/tensorflow/core/common_runtime/step_stats_collector.cc b/tensorflow/core/common_runtime/step_stats_collector.cc
index f6c55b46315..8fc5613ced1 100644
--- a/tensorflow/core/common_runtime/step_stats_collector.cc
+++ b/tensorflow/core/common_runtime/step_stats_collector.cc
@@ -12,35 +12,165 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/common_runtime/step_stats_collector.h"
 
+#include "tensorflow/core/common_runtime/step_stats_collector.h"
+#include "tensorflow/core/common_runtime/costmodel_manager.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/graph/costmodel.h"
+#include "tensorflow/core/lib/strings/scanner.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
 
-StepStatsCollector::StepStatsCollector(StepStats* ss,
-                                       CostModelManager* cost_model_manager)
-    : step_stats_(ss), cost_model_manager_(cost_model_manager) {}
+StepStatsCollector::StepStatsCollector(StepStats* ss) : step_stats_(ss) {}
 
-void StepStatsCollector::UpdateCostModelNode(const NodeExecStats* nt,
-                                             const Graph* graph,
-                                             const Node* node) {
-  mutex_lock l(mu_);
-  if (cost_model_manager_ != nullptr) {
-    CostModel* cm = cost_model_manager_->FindOrCreateCostModel(graph);
-    cm->RecordMaxExecutionTime(node, Microseconds(nt->op_end_rel_micros()));
+static int ExtractGpuWithStreamAll(string device_name) {
+  // Check if the device name matches the ".*gpu:(\\d+)/stream:all$" regexp,
+  // and if it does return the gpu index (always non-negative). If it doesn't
+  // match, return -1.
 
-    for (int i = 0; i < nt->output_size(); ++i) {
-      cm->RecordMaxMemorySize(node, i, Bytes(nt->output(i)
-                                                 .tensor_description()
-                                                 .allocation_description()
-                                                 .allocated_bytes()));
-      cm->RecordAllocationId(node, i, nt->output(i)
-                                          .tensor_description()
-                                          .allocation_description()
-                                          .allocation_id());
+  // The best way to parse this regexp using a scanner is to parse it in
+  // reverse starting from the end.
+  std::reverse(device_name.begin(), device_name.end());
+  strings::Scanner scanner(device_name);
+  // Check that the string ends with '/stream:all'
+  scanner.OneLiteral("lla:maerts/");
+  // Capture the digits if present
+  scanner.RestartCapture().Many(strings::Scanner::DIGIT).StopCapture();
+  // Check that the digits are preceded by the 'gpu:' string
+  scanner.OneLiteral(":upg");
+  StringPiece capture;
+  bool matched = scanner.GetResult(nullptr, &capture);
+
+  if (!matched) {
+    return -1;
+  } else {
+    // Convert the captured string into an integer. But first we need to put
+    // the digits back in order
+    string ordered_capture = capture.ToString();
+    std::reverse(ordered_capture.begin(), ordered_capture.end());
+    int gpu_id;
+    CHECK(strings::safe_strto32(ordered_capture, &gpu_id));
+    return gpu_id;
+  }
+}
+
+static int ExtractGpuWithoutStream(string device_name) {
+  // Check if the device name matches the ".*gpu:(\\d+)$" regexp, and if it
+  // does return the gpu index (always non-negative). If it doesn't match,
+  // return -1.
+
+  // The best way to parse this regexp using a scanner is to parse it in
+  // reverse starting from the end.
+  std::reverse(device_name.begin(), device_name.end());
+  strings::Scanner scanner(device_name);
+  // Capture the trailing digits if present
+  scanner.RestartCapture().Many(strings::Scanner::DIGIT).StopCapture();
+  // Check that the digits are preceded by the 'gpu:' string
+  scanner.OneLiteral(":upg");
+  StringPiece capture;
+  bool matched = scanner.GetResult(nullptr, &capture);
+
+  if (!matched) {
+    return -1;
+  } else {
+    // Convert the captured string into an integer. But first we need to put
+    // the digits back in order
+    string ordered_capture = capture.ToString();
+    std::reverse(ordered_capture.begin(), ordered_capture.end());
+    int gpu_id;
+    CHECK(strings::safe_strto32(ordered_capture, &gpu_id));
+    return gpu_id;
+  }
+}
+
+void StepStatsCollector::BuildCostModel(
+    CostModelManager* cost_model_manager,
+    const std::unordered_map<string, const Graph*>& device_map) {
+  mutex_lock lock(mu_);
+
+  // Hardware stats for gpu are available under a fake device named
+  // "gpu:<id>/stream:all".
+  // Use them instead of regular stats whenever they're available to extract
+  // the execution stats of a particular node since they're more accurate.
+  // However hardware traces don't record memory usage, so we still have to
+  // rely on regular traces to track memory usage.
+  struct DeviceStats {
+    const DeviceStepStats* regular_stats;
+    const DeviceStepStats* hardware_stats;
+  };
+
+  std::unordered_map<string, DeviceStats> per_device_stats;
+  std::unordered_map<int, const DeviceStepStats*> gpu_hardware_stats;
+
+  for (int i = 0; i < step_stats_->dev_stats_size(); ++i) {
+    const DeviceStepStats& device_stats = step_stats_->dev_stats(i);
+    const string device_name = device_stats.device();
+    const int gpu_id = ExtractGpuWithStreamAll(device_name);
+    if (gpu_id >= 0) {
+      // These are gpu hardware stats
+      gpu_hardware_stats[gpu_id] = &device_stats;
+    } else {
+      // These are regular stats.
+      per_device_stats[device_name] = DeviceStats{&device_stats, nullptr};
+    }
+  }
+
+  for (auto& itr : per_device_stats) {
+    const string& device_name = itr.first;
+    const int gpu_id = ExtractGpuWithoutStream(device_name);
+    if (gpu_id >= 0) {
+      // Reference the gpu hardware stats in addition to the regular stats
+      // for this gpu device if they're available.
+      if (gpu_hardware_stats.find(gpu_id) != gpu_hardware_stats.end()) {
+        itr.second.hardware_stats = gpu_hardware_stats.find(gpu_id)->second;
+      }
+    }
+  }
+
+  for (auto itr : device_map) {
+    const string device = itr.first;
+    if (per_device_stats.find(device) == per_device_stats.end()) {
+      continue;
+    }
+
+    const Graph* graph = itr.second;
+    CostModel* cm = cost_model_manager->FindOrCreateCostModel(graph);
+
+    std::unordered_map<string, Node*> name_to_node;
+    for (Node* n : graph->nodes()) {
+      name_to_node[n->name()] = n;
+    }
+
+    const DeviceStats& dev_stats = per_device_stats.find(device)->second;
+
+    for (int i = 0; i < dev_stats.regular_stats->node_stats_size(); ++i) {
+      const NodeExecStats& stats = dev_stats.regular_stats->node_stats(i);
+      const Node* node = name_to_node[stats.node_name()];
+      if (node) {
+        for (int i = 0; i < stats.output_size(); ++i) {
+          cm->RecordMaxMemorySize(node, i, Bytes(stats.output(i)
+                                                     .tensor_description()
+                                                     .allocation_description()
+                                                     .allocated_bytes()));
+          cm->RecordAllocationId(node, i, stats.output(i)
+                                              .tensor_description()
+                                              .allocation_description()
+                                              .allocation_id());
+        }
+        // Use hardware stats to record the execution time if they're available,
+        // otherwise use the regular (less accurate) stats
+        if (dev_stats.hardware_stats &&
+            i < dev_stats.hardware_stats->node_stats_size()) {
+          const NodeExecStats& hw_stats =
+              dev_stats.hardware_stats->node_stats(i);
+          cm->RecordMaxExecutionTime(
+              node, Microseconds(hw_stats.op_end_rel_micros()));
+        } else {
+          cm->RecordMaxExecutionTime(node,
+                                     Microseconds(stats.op_end_rel_micros()));
+        }
+      }
     }
   }
 }
diff --git a/tensorflow/core/common_runtime/step_stats_collector.h b/tensorflow/core/common_runtime/step_stats_collector.h
index 84250a1bdfd..8b71b6a0e33 100644
--- a/tensorflow/core/common_runtime/step_stats_collector.h
+++ b/tensorflow/core/common_runtime/step_stats_collector.h
@@ -16,26 +16,24 @@ limitations under the License.
 #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_STEP_STATS_COLLECTOR_H_
 
 #include <unordered_map>
-#include "tensorflow/core/common_runtime/costmodel_manager.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 
-class CostModel;
+class CostModelManager;
 class Graph;
-class Node;
 class NodeExecStats;
 class StepStats;
 
 class StepStatsCollector {
  public:
-  explicit StepStatsCollector(StepStats* ss,
-                              CostModelManager* cost_model_manager = nullptr);
+  explicit StepStatsCollector(StepStats* ss);
 
-  void UpdateCostModelNode(const NodeExecStats* nt, const Graph* graph,
-                           const Node* node);
+  void BuildCostModel(
+      CostModelManager* cost_model_manager,
+      const std::unordered_map<string, const Graph*>& device_map);
 
   void Save(const string& device, NodeExecStats* nt);
 
@@ -44,7 +42,6 @@ class StepStatsCollector {
  private:
   mutex mu_;
   StepStats* step_stats_ GUARDED_BY(mu_);
-  CostModelManager* cost_model_manager_ GUARDED_BY(mu_);
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
index e12646325e2..30c260d7d12 100644
--- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
@@ -130,7 +130,6 @@ BaseRemoteRendezvous::BaseRemoteRendezvous(const WorkerEnv* env, int64 step_id,
                                            bool tolerate_dup_recv)
     : env_(env),
       step_id_(step_id),
-      tolerate_dup_recv_(tolerate_dup_recv),
       local_(NewLocalRendezvous(tolerate_dup_recv)) {}
 
 BaseRemoteRendezvous::~BaseRemoteRendezvous() {
diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h
index b208c0f8742..2d939f12f2c 100644
--- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h
+++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h
@@ -162,7 +162,6 @@ class BaseRemoteRendezvous : public Rendezvous {
   const int64 step_id_;
 
  private:
-  const bool tolerate_dup_recv_;
   Rendezvous* local_;  // Owns a Ref on this object.
 
   mutable mutex mu_;
diff --git a/tensorflow/core/distributed_runtime/remote_device_test.cc b/tensorflow/core/distributed_runtime/remote_device_test.cc
index b2fc6d73b7c..5ad88a581fb 100644
--- a/tensorflow/core/distributed_runtime/remote_device_test.cc
+++ b/tensorflow/core/distributed_runtime/remote_device_test.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/regexp.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -46,14 +45,11 @@ class RemoteDeviceTest : public ::testing::Test {
     (*options.config.mutable_device_count())["CPU"] = 2;
     TF_CHECK_OK(test::TestCluster::MakeTestCluster(options, 1, &cluster_));
     const string& hostport = cluster_->targets()[0];
-    string host;
-    int port;
-    CHECK(RE2::FullMatch(hostport, "(.+):(\\d+)", &host, &port));
     GrpcChannelSpec spec;
     spec.AddHostPortsJob("localhost", {hostport});
     worker_cache_.reset(
         NewGrpcWorkerCache(NewGrpcChannelCache(spec, NewHostPortGrpcChannel)));
-    remote_name_ = strings::StrCat("/job:", host, "/replica:0/task:0");
+    remote_name_ = "/job:localhost/replica:0/task:0";
     wi_.reset(worker_cache_->CreateWorker(remote_name_));
   }
 
diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index b231f48079e..22c58f55a3b 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -91,6 +91,7 @@ cc_library(
     hdrs = ["grpc_channel.h"],
     deps = [
         ":grpc_util",
+        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "@grpc//:grpc++_unsecure",
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc b/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
index e29e0ef7011..eb188a79842 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
@@ -24,14 +24,15 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/regexp.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/device_name_utils.h"
 
 namespace tensorflow {
 
@@ -53,10 +54,11 @@ SharedGrpcChannelPtr NewHostPortGrpcChannel(const string& target) {
 
 namespace {
 Status ValidateHostPortPair(const string& host_port) {
-  const static RE2* kHostPortRE = new RE2("([^:/]+):(\\d+)");
-  string host;
-  int port;
-  if (!RE2::FullMatch(host_port, *kHostPortRE, &host, &port)) {
+  uint32 port;
+  std::vector<string> parts = str_util::Split(host_port, ':');
+  // Must be host:port, port must be a number, host must not contain a '/'.
+  if (parts.size() != 2 || !strings::safe_strtou32(parts[1], &port) ||
+      parts[0].find("/") != string::npos) {
     return errors::InvalidArgument("Could not interpret \"", host_port,
                                    "\" as a host-port pair.");
   }
@@ -204,23 +206,20 @@ class SparseGrpcChannelCache : public CachingGrpcChannelCache {
   }
 
   string TranslateTask(const string& target) override {
-    const static RE2* kTargetRE =
-        new RE2("^/job:([^/]+)/replica:([0-9]+)/task:([0-9]+)$");
-
-    RegexpStringPiece job;
-    int32 replica;
-    int32 task;
-    if (!RE2::FullMatch(target, *kTargetRE, &job, &replica, &task)) {
+    DeviceNameUtils::ParsedName parsed;
+    if (!DeviceNameUtils::ParseFullName(target, &parsed)) {
       LOG(WARNING) << "Invalid target: " << target;
       return "";
     }
-    if (job != job_id_) {
+
+    if (!parsed.has_job || parsed.job != job_id_) {
       return "";
     }
-    if (replica != 0) {
+    if (!parsed.has_replica || parsed.replica != 0) {
       LOG(WARNING) << "Replica ID must be 0 in target: " << target;
       return "";
     }
+    int32 task = parsed.has_task ? parsed.task : -1;
     auto iter = host_ports_.find(task);
     if (iter == host_ports_.end()) {
       LOG(WARNING) << "Task " << task << " was not defined in sparse job "
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
index 60597aab262..657113e01ef 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
@@ -250,7 +250,6 @@ class GrpcRemoteWorker : public WorkerInterface {
 
   // Support for logging.
   WorkerCacheLogger* logger_;
-  bool retry_unavailable_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(GrpcRemoteWorker);
 };
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
index 4015eef07ba..b6b05b7c304 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
@@ -23,7 +23,7 @@ class ServerBuilder;
 namespace tensorflow {
 
 class AsyncServiceInterface;
-class WorkerEnv;
+struct WorkerEnv;
 
 // Returns an implementation of WorkerService rpc service.
 AsyncServiceInterface* NewGrpcWorkerService(WorkerEnv* env,
diff --git a/tensorflow/core/framework/fake_input.cc b/tensorflow/core/framework/fake_input.cc
index 1daf70cd2f3..7a21dd5066c 100644
--- a/tensorflow/core/framework/fake_input.cc
+++ b/tensorflow/core/framework/fake_input.cc
@@ -172,7 +172,7 @@ void FakeInputImpl::NSources(int n, DataType dt) const {
   for (int i = 0; i < n; ++i) {
     srcs.emplace_back(in_node_, i, dt);
   }
-  builder_->Input(srcs);
+  builder_->Input(gtl::ArraySlice<NodeDefBuilder::NodeOut>(srcs));
 }
 
 void FakeInputImpl::SourceList(DataTypeSlice dts) const {
@@ -181,7 +181,7 @@ void FakeInputImpl::SourceList(DataTypeSlice dts) const {
   for (size_t i = 0; i < dts.size(); ++i) {
     srcs.emplace_back(in_node_, i, dts[i]);
   }
-  builder_->Input(srcs);
+  builder_->Input(gtl::ArraySlice<NodeDefBuilder::NodeOut>(srcs));
 }
 
 }  // namespace
diff --git a/tensorflow/core/framework/resource_mgr.h b/tensorflow/core/framework/resource_mgr.h
index ffc75aed2bc..39163522f85 100644
--- a/tensorflow/core/framework/resource_mgr.h
+++ b/tensorflow/core/framework/resource_mgr.h
@@ -56,8 +56,8 @@ namespace tensorflow {
 //   // Create a var.
 //   MyVar* my_var = new MyVar;
 //   my_var.val = Tensor(DT_FLOAT, my_shape);
-//   my_val.val.flat<float>().setZeros();   // 0 initialized.
-//   ctx->SetStatus(rm.Create("my_container", "my_name", my_val));
+//   my_var.val.flat<float>().setZeros();   // 0 initialized.
+//   ctx->SetStatus(rm.Create("my_container", "my_name", my_var));
 //
 //   // += a variable.
 //   MyVar* my_var = nullptr;
diff --git a/tensorflow/core/graph/costmodel.cc b/tensorflow/core/graph/costmodel.cc
index df0c14c1a55..5d9dfd6699c 100644
--- a/tensorflow/core/graph/costmodel.cc
+++ b/tensorflow/core/graph/costmodel.cc
@@ -430,6 +430,8 @@ void CostModel::AddToCostGraphDef(const Graph* graph,
 
     cnode->set_temporary_memory_size(TempMemorySize(n).value());
 
+    cnode->set_compute_cost(MaxExecutionTime(n).value());
+
     // For now we treat all send nodes as final.
     // TODO(yuanbyu): Send nodes for fetches shouldn't be treated as final.
     cnode->set_is_final(n->IsSend());
diff --git a/tensorflow/core/graph/dot.cc b/tensorflow/core/graph/dot.cc
deleted file mode 100644
index fc064cb9c9e..00000000000
--- a/tensorflow/core/graph/dot.cc
+++ /dev/null
@@ -1,311 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/graph/dot.h"
-
-#include <map>
-#include <unordered_map>
-#include <unordered_set>
-
-#include "tensorflow/core/graph/colors.h"
-#include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/regexp.h"
-#include "tensorflow/core/util/util.h"
-
-namespace tensorflow {
-
-static string GraphNodeName(const DotOptions& opts, const Node* n) {
-  return strings::StrCat("N", n->id());
-}
-
-bool ShouldDisplayOpType(const Node* n) {
-  if (n->type_string() == "NoOp") {
-    return false;
-  }
-  const string& op_name = n->def().name();
-  if (op_name.find(n->type_string() + "_") == 0) {
-    return false;
-  }
-  return true;
-}
-
-string DotGraph(const Graph& g, const DotOptions& opts) {
-  RegexpStringPiece flag(opts.prefix_collapse_regexp);
-  if (flag == "all") {
-    flag = ".";
-  } else if (flag == "none") {
-    flag = "^$";
-  }
-  RE2 cluster_name_pattern(flag);
-  string result;
-  strings::StrAppend(&result, "digraph G {\n");
-  strings::StrAppend(&result, "rankdir=\"BT\"\n");
-
-  std::map<string, int> device_index;       // Map from device name to index.
-  std::unordered_set<Node*> visible_nodes;  // Nodes to display.
-  // Cluster name => set of nodes.
-  std::unordered_map<string, std::unordered_set<Node*> > clusters;
-  // Node* => Cluster
-  std::unordered_map<Node*, string> node_cluster;
-  for (Node* src : g.nodes()) {
-    if (opts.include_node_function != nullptr &&
-        !opts.include_node_function(src)) {
-      continue;
-    }
-    // Do not display source and sink nodes
-    if (src->IsSource() || src->IsSink()) {
-      continue;
-    }
-    visible_nodes.insert(src);
-    const string name_prefix = NodeNamePrefix(src->def().name()).ToString();
-    if (!name_prefix.empty()) {
-      clusters[name_prefix].insert(src);
-      node_cluster[src] = name_prefix;
-    }
-    // Record device if present.
-    if (src->IsOp()) {
-      const string& d = src->assigned_device_name();
-      if (!d.empty()) {
-        device_index[d] = -1;  // Assigned later
-      }
-    }
-  }
-
-  // Add nodes whose name is exactly a cluster name to the cluster itself.
-  for (Node* src : g.nodes()) {
-    if (node_cluster.count(src) == 0) {
-      const string name = src->def().name();
-      auto it = clusters.find(name);
-      if (it != clusters.end()) {
-        it->second.insert(src);
-        node_cluster[src] = name;
-      }
-    }
-  }
-
-  auto node_in_collapsed_cluster = [&node_cluster,
-                                    &cluster_name_pattern](Node* n) {
-    return node_cluster.count(n) > 0 &&
-           RE2::PartialMatch(node_cluster[n], cluster_name_pattern);
-  };
-
-  // Assign device indices in sorted order.
-  int num = 0;
-  for (auto& e : device_index) {
-    e.second = num++;
-  }
-
-  double total_node_cost = 0;
-  double avg_node_cost = 1;
-  if (opts.node_cost) {
-    int node_count = 0;
-    for (const Node* n : g.nodes()) {
-      total_node_cost += opts.node_cost(n);
-      ++node_count;
-    }
-    if (total_node_cost > 0) avg_node_cost = total_node_cost / node_count;
-  }
-
-  for (Node* src : g.nodes()) {
-    if (visible_nodes.count(src) == 0 || node_in_collapsed_cluster(src)) {
-      continue;
-    }
-    string label = src->name();
-    if (ShouldDisplayOpType(src)) {
-      // Append the op type if it is not directly deducible from the op name.
-      strings::StrAppend(&label, "\\n(", src->type_string(), ")");
-    }
-    const char* shape = "box";
-    const char* color = nullptr;
-    if (src->IsSource()) {
-      shape = "oval";
-    } else if (src->IsSink()) {
-      shape = "oval";
-    } else {
-      const string& d = src->assigned_device_name();
-
-      int dindex;
-      if (opts.node_color) {
-        dindex = opts.node_color(src);
-      } else {
-        dindex = (!d.empty()) ? device_index[d] : -1;
-      }
-
-      if (dindex >= 0) {
-        color = ColorFor(dindex);
-      }
-
-      shape = "box";
-    }
-
-    if (opts.node_label) {
-      string extra = opts.node_label(src);
-      if (!extra.empty()) {
-        strings::StrAppend(&label, "\\n", extra);
-      }
-    }
-
-    strings::StrAppend(&result, GraphNodeName(opts, src), "[shape=", shape,
-                       ", label=\"", label, "\"");
-    if (opts.node_cost && total_node_cost > 0) {
-      // Pick fontsize in range [8..40] so that area is proportional to cost.
-      const double cost = opts.node_cost(src);
-      const double relcost = fabs(cost / avg_node_cost);
-      // Average cost node has font size of 12.
-      const int fs = 8 + static_cast<int>(4.0 * std::min(sqrt(relcost), 8.0));
-      strings::StrAppend(&result, ", width=0, height=0, fontsize=", fs);
-      VLOG(2) << "Node: " << cost << " => " << relcost << " => " << fs;
-    }
-    if (color != nullptr) {
-      strings::StrAppend(&result, ", fillcolor=\"", color,
-                         "\", fontcolor=\"white\", style=\"filled\"");
-    }
-    strings::StrAppend(&result, "]\n");
-  }
-
-  for (auto c : clusters) {
-    const string& cluster_name = c.first;
-    const std::unordered_set<Node*> nodes = c.second;
-    std::unordered_map<string, int> node_colors;
-    for (auto n : nodes) {
-      const string& d = n->assigned_device_name();
-      const int dindex = (!d.empty()) ? device_index[d] : -1;
-      if (dindex >= 0) {
-        ++node_colors[ColorFor(dindex)];
-      }
-    }
-
-    string majority_color;
-    if (node_colors.empty()) {
-      majority_color = ColorFor(0);
-    } else {
-      majority_color = std::max_element(node_colors.begin(), node_colors.end(),
-                                        [](const std::pair<string, int>& x,
-                                           const std::pair<string, int>& y) {
-                                          return x.second < y.second;
-                                        })
-                           ->first;
-    }
-
-    if (!RE2::PartialMatch(cluster_name, cluster_name_pattern)) {
-      strings::StrAppend(&result, "subgraph cluster_", cluster_name, "{\n");
-      for (auto n : nodes) {
-        strings::StrAppend(&result, GraphNodeName(opts, n), ";\n");
-      }
-      strings::StrAppend(&result, "}\n");
-    } else {
-      strings::StrAppend(&result, cluster_name, " [shape=oval, fillcolor=\"",
-                         majority_color, "\", label=\"", cluster_name,
-                         "\", style=\"filled\", fontcolor=\"white\"]\n");
-    }
-  }
-
-  std::unordered_set<string> edge_drawn;
-
-  double max_edge_cost = 0;
-  double total_edge_cost = 0;
-  double avg_edge_cost = 1;
-  if (opts.edge_cost && g.edges().size()) {
-    for (const Edge* e : g.edges()) {
-      auto cost = opts.edge_cost(e);
-      total_edge_cost += cost;
-      max_edge_cost = std::max(max_edge_cost, cost);
-    }
-    avg_edge_cost = total_edge_cost / g.edges().size();
-  }
-  VLOG(2) << "Edge cost tot/max/avg: " << total_edge_cost << "/"
-          << max_edge_cost << "/" << avg_edge_cost;
-
-  for (const Edge* e : g.edges()) {
-    Node* src = e->src();
-    Node* dst = e->dst();
-    // If either endpoint isn't drawn in the graph, don't draw the edge
-    if (visible_nodes.count(src) == 0 || visible_nodes.count(dst) == 0) {
-      continue;
-    }
-
-    const string src_name = node_in_collapsed_cluster(src)
-                                ? node_cluster[src]
-                                : GraphNodeName(opts, src);
-    const string dst_name = node_in_collapsed_cluster(dst)
-                                ? node_cluster[dst]
-                                : GraphNodeName(opts, dst);
-    // Don't draw self edges
-    if (src_name == dst_name) {
-      continue;
-    }
-    // And previously drawn edges.
-    const string& edge_name = strings::StrCat(src_name, ":", dst_name);
-    if (edge_drawn.count(edge_name) > 0) {
-      continue;
-    }
-    edge_drawn.insert(edge_name);
-
-    strings::StrAppend(&result, src_name, " -> ", dst_name, "[");
-    string label;
-    if (e->IsControlEdge()) {
-      strings::StrAppend(&result, " style=dotted");
-    }
-    if (opts.edge_label) {
-      string label = opts.edge_label(e);
-      if (!label.empty()) {
-        strings::StrAppend(&result, " label=<", label, ">");
-      }
-    }
-    // Make edge widths proportional to amount of data transferred.
-    if (opts.edge_cost && max_edge_cost > 0) {
-      const double cost = opts.edge_cost(e);
-      const double relcost = fabs(cost / avg_edge_cost);
-      // Pick penwidth in range [1..6] so that width is proportional to cost.
-      const int pw = 1 + std::min(5, static_cast<int>(2.0 * relcost));
-      strings::StrAppend(&result, " penwidth=", pw);
-      // Use weight attributes [1..100] to keep heavier edges more vertical.
-      const int weight = 1 + std::min(99, static_cast<int>(100.0 * relcost));
-      strings::StrAppend(&result, " weight=", weight);
-      VLOG(2) << "Edge: " << cost << " => " << relcost << " => " << pw << "/"
-              << weight;
-    }
-
-    strings::StrAppend(&result, "]\n");
-  }
-  // Compute some statistics
-  int op_nodes = 0;
-  for (Node* n : g.nodes()) {
-    if (n->IsOp()) {
-      op_nodes++;
-    }
-  }
-
-  // Emit legend
-  strings::StrAppend(&result,
-                     "{ rank = source; Legend [shape=box, margin=0, label=<",
-                     "<TABLE BORDER=\"0\" CELLBORDER=\"1\" CELLSPACING=\"0\" ",
-                     "CELLPADDING=\"4\">", "<TR><TD COLSPAN=\"2\">op_nodes: ",
-                     op_nodes, "</TD></TR>\n");
-  for (const auto& e : device_index) {
-    const int dindex = e.second;
-    strings::StrAppend(&result, "<TR><TD BGCOLOR=\"", ColorFor(dindex),
-                       "\"><FONT COLOR=\"white\">", dindex, "</FONT></TD><TD>",
-                       e.first, "</TD></TR>\n");
-  }
-  strings::StrAppend(&result, "</TABLE>>]}\n");
-
-  strings::StrAppend(&result, "}\n");  // End digraph
-  return result;
-}
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/graph/dot.h b/tensorflow/core/graph/dot.h
deleted file mode 100644
index d0efb0a610c..00000000000
--- a/tensorflow/core/graph/dot.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_GRAPH_DOT_H_
-#define TENSORFLOW_GRAPH_DOT_H_
-
-#include <functional>
-#include <string>
-#include "tensorflow/core/platform/types.h"
-
-namespace tensorflow {
-
-class Edge;
-class Graph;
-class Node;
-
-struct DotOptions {
-  bool (*include_node_function)(const Node*) = nullptr;
-
-  // By default, all nodes with the same name prefix are collapsed into
-  // a single node in the dot graph.  This regexp can be changed so that
-  // only prefixes that match the regexp are collapsed in this fashion.
-  // 'all' collapses all ops with prefixes, 'none' disables all collapsing.
-  string prefix_collapse_regexp = "all";
-
-  // A function that returns a label to embed into the per-node display.
-  std::function<string(const Node*)> node_label;
-
-  // A function that returns a label to attach to an edge.
-  std::function<string(const Edge*)> edge_label;
-
-  // A function that returns the "cost" of the node.  The dot display
-  // makes a node size proportional to its cost.
-  std::function<double(const Node*)> node_cost;
-
-  // A function that returns the "cost" of the edge.  The dot display
-  // makes a edge thickness proportional to its cost.
-  std::function<double(const Edge*)> edge_cost;
-
-  // A function that returns a color number to apply to each node. < 0 means
-  // no color. A color will be assigned to each color number from a palette;
-  // adjacent color numbers will receive different colors.
-  std::function<int(const Node*)> node_color;
-};
-
-// Return a string that contains a graphviz specification of the graph.
-string DotGraph(const Graph& g, const DotOptions& opts);
-
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_GRAPH_DOT_H_
diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc
index 6020df92846..7acdfaa70a2 100644
--- a/tensorflow/core/graph/graph_constructor.cc
+++ b/tensorflow/core/graph/graph_constructor.cc
@@ -15,11 +15,15 @@ limitations under the License.
 
 #include "tensorflow/core/graph/graph_constructor.h"
 
+#include <algorithm>
+#include <set>
 #include <string>
 #include <unordered_map>
 #include <vector>
 
+#include "tensorflow/core/common_runtime/shape_refiner.h"
 #include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/versions.h"
 #include "tensorflow/core/graph/algorithm.h"
@@ -50,38 +54,88 @@ bool IsValidNodeName(StringPiece s, bool allow_internal_ops) {
 
 class GraphConstructor {
  public:
-  static Status Construct(const GraphConstructorOptions& opts,
-                          const GraphDef* gdef, Graph* g) {
+  struct Options {
+    Options(const GraphConstructorOptions& in)
+        : allow_internal_ops(in.allow_internal_ops),
+          expect_device_spec(in.expect_device_spec),
+          importing(false) {}
+    Options(const ImportGraphDefOptions& in)
+        : allow_internal_ops(false),
+          expect_device_spec(false),
+          prefix(in.prefix.empty() || StringPiece(in.prefix).ends_with("/")
+                     ? in.prefix
+                     : in.prefix + "/"),
+          importing(true) {}
+
+    bool allow_internal_ops;
+    bool expect_device_spec;
+
+    string prefix;
+    // TODO(ashankar): This bool exists to separate out functionality required
+    // to make ImportGraphDef a close equivalent of Python's import_graph_def
+    // without affecting the behavior of ConvertGraphDefToGraph at the time
+    // ImportGraphDef was added.
+    //
+    // That said, the functionality here (shape and op validation) seems
+    // applicable to ConvertGraphDefToGraph as well, so make an attempt to
+    // remove this.
+    bool importing;
+  };
+
+  static Status Construct(const Options& opts, const GraphDef* gdef, Graph* g,
+                          ShapeRefiner* refiner) {
     TF_RETURN_IF_ERROR(CheckVersions(gdef->versions(), TF_GRAPH_DEF_VERSION,
                                      TF_GRAPH_DEF_VERSION_MIN_PRODUCER,
                                      "GraphDef", "graph"));
-    GraphConstructor c(opts, gdef, g);
-    g->set_versions(gdef->versions());
-    TF_RETURN_IF_ERROR(c.BuildNodeIndex());
-    TF_RETURN_IF_ERROR(c.InitFromEdges());
-    TF_RETURN_IF_ERROR(c.Convert());
-    TF_RETURN_IF_ERROR(c.AddBackEdges());
-    FixupSourceAndSinkEdges(g);
-    return Status::OK();
+    GraphConstructor c(opts, gdef, g, refiner);
+    const Status s = c.TryImport();
+    if (!s.ok()) c.Undo();
+    return s;
   }
 
  private:
-  GraphConstructor(const GraphConstructorOptions& opts, const GraphDef* gdef,
-                   Graph* g)
-      : opts_(opts), gdef_(gdef), g_(g) {}
+  GraphConstructor(const Options& opts, const GraphDef* gdef, Graph* g,
+                   ShapeRefiner* refiner)
+      : opts_(opts),
+        gdef_(gdef),
+        g_(g),
+        original_versions_(g->versions()),
+        refiner_(refiner) {}
 
+  Status TryImport() {
+    TF_RETURN_IF_ERROR(EnsureNoNameCollisions());
+    TF_RETURN_IF_ERROR(BuildNodeIndex());
+    TF_RETURN_IF_ERROR(InitFromEdges());
+    TF_RETURN_IF_ERROR(Convert());
+    TF_RETURN_IF_ERROR(AddBackEdges());
+    TF_RETURN_IF_ERROR(UpdateVersionDef());
+    FixupSourceAndSinkEdges(g_);
+    return Status::OK();
+  }
+
+  Status EnsureNoNameCollisions();
   Status BuildNodeIndex();
   Status InitFromEdges();
   Status Convert();
   Status AddBackEdges();
+  Status UpdateVersionDef();
 
+  void Undo();
+
+  Status ValidateColocationConstraints(const NodeDef& node_def);
   Status MakeNode(const NodeDef& node_def, Node** node);
   Status MakeEdge(Node* src, int output_index, Node* dst, int input_index);
+  Status ValidateShape(Node* node);
+  Status ModifyNodeDefForImport(NodeDef* node_def);
+  void AddPrefixToNodeDef(NodeDef* node_def);
 
   // From constructor
-  const GraphConstructorOptions opts_;
+  const Options opts_;
   const GraphDef* gdef_;
   Graph* g_;
+  const VersionDef original_versions_;
+
+  ShapeRefiner* refiner_;
 
   // Mapping from node name to the index within gdef_
   struct NodeInfo {
@@ -129,6 +183,41 @@ class GraphConstructor {
   std::vector<EdgeInfo> back_edges_;
 };
 
+Status GraphConstructor::EnsureNoNameCollisions() {
+  if (opts_.prefix.empty() && opts_.importing) {
+    std::unordered_set<string> existing(g_->num_nodes());
+    for (const Node* n : g_->nodes()) {
+      existing.insert(n->name());
+    }
+    for (int n = 0; n < gdef_->node_size(); ++n) {
+      const string& name = gdef_->node(n).name();
+      if (existing.find(name) != existing.end()) {
+        return errors::InvalidArgument("Node '", name,
+                                       "' already exists in the Graph");
+      }
+    }
+  } else if (!opts_.prefix.empty()) {
+    // Importing nodes with a prefix. No nodes should exist with the same
+    // prefix.
+    StringPiece prefix_no_slash(opts_.prefix);
+    prefix_no_slash.remove_suffix(1);
+    if (!IsValidNodeName(prefix_no_slash, false)) {
+      return errors::InvalidArgument("Imported node name prefix '",
+                                     opts_.prefix,
+                                     "' would lead to invalid node names");
+    }
+    for (const Node* n : g_->nodes()) {
+      if (StringPiece(n->name()).starts_with(opts_.prefix)) {
+        return errors::InvalidArgument(
+            "Import node name prefix conflicts with names of nodes already in "
+            "the Graph, such as '",
+            n->name(), "'");
+      }
+    }
+  }
+  return Status::OK();
+}
+
 Status GraphConstructor::BuildNodeIndex() {
   // Validate the node names and add them to name_index_.
   for (int n = 0; n < gdef_->node_size(); ++n) {
@@ -198,6 +287,23 @@ Status GraphConstructor::InitFromEdges() {
   return Status::OK();
 }
 
+Status GraphConstructor::ValidateColocationConstraints(
+    const NodeDef& node_def) {
+  if (!opts_.importing) return Status::OK();
+  const auto iter = node_def.attr().find(kColocationAttrName);
+  if (iter == node_def.attr().end()) return Status::OK();
+  for (const string& c : iter->second.list().s()) {
+    StringPiece s(c);
+    if (s.Consume(kColocationGroupPrefix) &&
+        name_index_.find(s) == name_index_.end()) {
+      return errors::InvalidArgument(
+          "Node '", node_def.name(),
+          "' expects to be colocated with unknown node '", s, "'");
+    }
+  }
+  return Status::OK();
+}
+
 Status GraphConstructor::MakeNode(const NodeDef& node_def, Node** node) {
   // Add the node to the graph.
   Status status;
@@ -206,24 +312,131 @@ Status GraphConstructor::MakeNode(const NodeDef& node_def, Node** node) {
   if (opts_.expect_device_spec) {
     (*node)->set_assigned_device_name(node_def.device());
   }
-  name_index_[node_def.name()].node = *node;
   return Status::OK();
 }
 
-// Return the number of nodes in "g"
-int CountNodes(Graph* g) {
-  int nodes = 0;
-  for (Node* node : g->nodes()) {
-    VLOG(3) << node;  // Dummy use to avoid compiler warning
-    nodes++;
+Status GraphConstructor::ValidateShape(Node* node) {
+  if (!opts_.importing) return Status::OK();
+  TF_RETURN_IF_ERROR(refiner_->AddNode(node));
+  // For nodes with the _output_shapes attribute, override the shape.
+  std::vector<TensorShapeProto> shape_attrs;
+  const char* kAttrName = "_output_shapes";
+  if (!GetNodeAttr(node->def(), kAttrName, &shape_attrs).ok()) {
+    // No _output_shapes attribute, the AddNode call above was sufficient.
+    return Status::OK();
+  }
+  auto* ic = refiner_->GetContext(node);
+  DCHECK(ic != nullptr)
+      << "ShapeRefiner::AddNode() should have created the InferenceContext";
+  if (shape_attrs.size() != node->num_outputs()) {
+    return errors::InvalidArgument(
+        "Node '", node->name(), "' has ", node->num_outputs(),
+        " outputs but the ", kAttrName, " attribute specifies shapes for ",
+        shape_attrs.size(), " outputs");
+  }
+  for (int i = 0; i < shape_attrs.size(); ++i) {
+    const TensorShapeProto& p = shape_attrs[i];
+    shape_inference::ShapeHandle h;
+    Status s = ic->MakeShapeFromShapeProto(p, &h);
+    if (!s.ok()) {
+      return errors::InvalidArgument("Node '", node->name(), " has an invalid ",
+                                     kAttrName, " attribute (shape #", i,
+                                     " error:'", s.error_message(), "'");
+    }
+    s = refiner_->SetShape(node, i, h);
+    if (!s.ok()) {
+      // If the output shape is incompatible with what is inferred
+      // by the graph for a very specific whitelist of ops, then we
+      // ignore this output shape.  This can happen if there is a
+      // bug in the shape function for some operation, and the
+      // serialized graph def has the incorrect shape set when
+      // running on a newer binary with the fixed shape function.
+      // This is an escape hatch that allows us to correct shape
+      // functions that are not critical to correct execution but
+      // would cause graphs to fail if imported after correcting.
+      //
+      // This can be removed after 2017/03/08.
+      const string& op = node->def().op();
+      const std::vector<string> whitelist = {"RandomShuffleQueue",
+                                             "PaddingFIFOQueue",
+                                             "FIFOQueue",
+                                             "PriorityQueue",
+                                             "QueueSize",
+                                             "Stack",
+                                             "Barrier",
+                                             "BarrierReadySize",
+                                             "BarrierIncompleteSize",
+                                             "HashTable",
+                                             "MutableHashTable",
+                                             "MutableHashTableOfTensors",
+                                             "Mutex",
+                                             "CuckooTable",
+                                             "IndexTable",
+                                             "WholeFileReader",
+                                             "TextLineReader",
+                                             "FixedLengthRecordReader",
+                                             "TFRecordReader",
+                                             "IdentityReader",
+                                             "RefSwitch",
+                                             "RefEnter",
+                                             "RefNextIteration",
+                                             "RefMerge",
+                                             "RefIdentity"};
+      if (std::find(whitelist.begin(), whitelist.end(), op) ==
+          whitelist.end()) {
+        return errors::InvalidArgument(
+            "Node '", node->name(), "' has an ", kAttrName,
+            " attribute inconsistent with the GraphDef for output #", i, ": ",
+            s.error_message());
+      }
+    }
+  }
+  node->ClearAttr(kAttrName);
+  return Status::OK();
+}
+
+Status GraphConstructor::ModifyNodeDefForImport(NodeDef* node_def) {
+  const OpDef* op_def;
+  TF_RETURN_IF_ERROR(g_->op_registry()->LookUpOpDef(node_def->op(), &op_def));
+  AddDefaultsToNodeDef(*op_def, node_def);
+  TF_RETURN_IF_ERROR(ValidateNodeDef(*node_def, *op_def));
+  TF_RETURN_IF_ERROR(CheckOpDeprecation(*op_def, TF_GRAPH_DEF_VERSION));
+  return Status::OK();
+}
+
+void GraphConstructor::AddPrefixToNodeDef(NodeDef* node_def) {
+  const string& prefix = opts_.prefix;
+  if (prefix.empty()) return;
+  node_def->set_name(strings::StrCat(prefix, node_def->name()));
+  // Update names of input nodes
+  for (int i = 0; i < node_def->input_size(); ++i) {
+    StringPiece input(node_def->input(i));
+    if (input.Consume("^")) {
+      node_def->set_input(i, strings::StrCat("^", prefix, input));
+    } else {
+      node_def->set_input(i, strings::StrCat(prefix, input));
+    }
+  }
+  // Update names of colocation groups
+  if (node_def->attr().find(kColocationAttrName) != node_def->attr().end()) {
+    auto* list =
+        node_def->mutable_attr()->at(kColocationAttrName).mutable_list();
+    for (int i = 0; i < list->s_size(); ++i) {
+      StringPiece v(list->s(i));
+      if (v.Consume(kColocationGroupPrefix)) {
+        list->set_s(i, strings::StrCat(kColocationGroupPrefix, prefix, v));
+      }
+    }
   }
-  return nodes;
 }
 
 Status GraphConstructor::Convert() {
   std::vector<InputInfo> inputs;
   int processed = 0;
   // Process the NodeDefs in topological order.
+  // (InitFromEdges() sets this up by filling in ready_ with nodes that have no
+  // inputs, pending_counts_ with the number of inputs for each node and
+  // outputs_ with the outputs of each node).
   while (!ready_.empty()) {
     int o = ready_.back();
     ready_.pop_back();
@@ -232,6 +445,7 @@ Status GraphConstructor::Convert() {
     inputs.clear();
     bool in_control_dependence = false;
     bool has_data_back_edge = false;
+    TF_RETURN_IF_ERROR(ValidateColocationConstraints(node_def));
     for (int i = 0; i < node_def.input_size(); ++i) {
       StringPiece input_name(node_def.input(i));
       if (input_name.Consume("^")) {
@@ -269,7 +483,19 @@ Status GraphConstructor::Convert() {
     }
 
     Node* node;
-    TF_RETURN_IF_ERROR(MakeNode(node_def, &node));
+    if (opts_.importing) {
+      // TODO(ashankar): The line below means an additional copy of the NodeDef,
+      // which can be expensive if the NodeDef contains large tensors in it.
+      // Might make sense to change the API for ImportGraphDef to take a mutable
+      // GraphDef* and avoid the copying.
+      NodeDef imported_node_def = node_def;
+      AddPrefixToNodeDef(&imported_node_def);
+      TF_RETURN_IF_ERROR(ModifyNodeDefForImport(&imported_node_def));
+      TF_RETURN_IF_ERROR(MakeNode(imported_node_def, &node));
+    } else {
+      TF_RETURN_IF_ERROR(MakeNode(node_def, &node));
+    }
+    name_index_[node_def.name()].node = node;
 
     // Add edges from inputs to *node to the graph.
     for (size_t i = 0; i < inputs.size(); ++i) {
@@ -284,6 +510,7 @@ Status GraphConstructor::Convert() {
         TF_RETURN_IF_ERROR(MakeEdge(inputs[i].node, inputs[i].index, node, i));
       }
     }
+    TF_RETURN_IF_ERROR(ValidateShape(node));
 
     // Update pending_count_ for outputs.
     for (size_t i = 0; i < outputs_[o].size(); ++i) {
@@ -319,6 +546,39 @@ Status GraphConstructor::AddBackEdges() {
   return Status::OK();
 }
 
+Status GraphConstructor::UpdateVersionDef() {
+  if (!opts_.importing) {
+    g_->set_versions(gdef_->versions());
+    return Status::OK();
+  }
+  VersionDef versions = g_->versions();
+  // This new graph is being "produced" by the binary invoking ImportGraphDef.
+  versions.set_producer(TF_GRAPH_DEF_VERSION);
+  versions.set_min_consumer(
+      std::max(versions.min_consumer(), gdef_->versions().min_consumer()));
+  if (gdef_->versions().bad_consumers_size() > 0) {
+    std::set<int> bad(versions.bad_consumers().begin(),
+                      versions.bad_consumers().end());
+    bad.insert(gdef_->versions().bad_consumers().begin(),
+               gdef_->versions().bad_consumers().end());
+    versions.clear_bad_consumers();
+    for (int v : bad) {
+      versions.add_bad_consumers(v);
+    }
+  }
+  g_->set_versions(versions);
+  return Status::OK();
+}
+
+void GraphConstructor::Undo() {
+  for (const auto& iter : name_index_) {
+    if (iter.second.node != nullptr) {
+      g_->RemoveNode(iter.second.node);
+    }
+  }
+  g_->set_versions(original_versions_);
+}
+
 Status GraphConstructor::MakeEdge(Node* src, int output_index, Node* dst,
                                   int input_index) {
   DataType src_out = src->output_type(output_index);
@@ -335,24 +595,21 @@ Status GraphConstructor::MakeEdge(Node* src, int output_index, Node* dst,
 
 }  // namespace
 
-// ----------------------------------------------------------------------------
-// GraphConstructorOptions functions
-// ----------------------------------------------------------------------------
-
-GraphConstructorOptions::GraphConstructorOptions() {}
-
-// ----------------------------------------------------------------------------
-// ConvertGraphDefToGraph
-// ----------------------------------------------------------------------------
-
 Status ConvertGraphDefToGraph(const GraphConstructorOptions& opts,
                               const GraphDef& gdef, Graph* g) {
-  return GraphConstructor::Construct(opts, &gdef, g);
+  ShapeRefiner refiner(g->op_registry());
+  return GraphConstructor::Construct(opts, &gdef, g, &refiner);
+}
+
+Status ImportGraphDef(const ImportGraphDefOptions& opts, const GraphDef& gdef,
+                      Graph* g, ShapeRefiner* refiner) {
+  ShapeRefiner default_refiner(g->op_registry());
+  if (refiner == nullptr) {
+    refiner = &default_refiner;
+  }
+  return GraphConstructor::Construct(opts, &gdef, g, refiner);
 }
 
-// ----------------------------------------------------------------------------
-// CopyGraph
-// ----------------------------------------------------------------------------
 void CopyGraph(const Graph& src, Graph* dest) {
   for (Node* n : dest->nodes()) {
     CHECK(n->IsSource() || n->IsSink()) << "*dest must be empty";
diff --git a/tensorflow/core/graph/graph_constructor.h b/tensorflow/core/graph/graph_constructor.h
index e605cb45238..a949362a033 100644
--- a/tensorflow/core/graph/graph_constructor.h
+++ b/tensorflow/core/graph/graph_constructor.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
+class ShapeRefiner;
 
 // Options specific to constant folding optimizations.
 //
@@ -31,10 +32,14 @@ struct ConstantFoldingOptions {
   std::function<bool(const Node*)> consider = nullptr;
 };
 
-// Construct a graph *g out of a GraphDef gdef. Returns non-OK on
+// Construct a Graph *g out of a GraphDef gdef. Returns non-OK on
 // error, in which case *g is left in an incomplete state.
+//
+// *g is expected to be an empty graph (with no more than source and sink
+// nodes) when provided to ConvertGraphDefToGraph. To enhance an existing Graph,
+// see ImportGraphDef.
 struct GraphConstructorOptions {
-  GraphConstructorOptions();
+  GraphConstructorOptions() {}
 
   // If true, allows internal ops in the GraphDef.
   bool allow_internal_ops = false;
@@ -49,6 +54,37 @@ struct GraphConstructorOptions {
 extern Status ConvertGraphDefToGraph(const GraphConstructorOptions& opts,
                                      const GraphDef& gdef, Graph* g);
 
+// Add the graph in GraphDef gdef into an existing Graph *g.
+//
+// On error, returns non-OK and leaves *g unmodified.
+//
+// "shape_refiner" can be null. It should be non-null if the caller
+// intends to add additonal nodes to the graph after the import. This
+// allows the caller to validate shapes of those nodes (since
+// ShapeRefiner::AddNode must be called in topological order).
+//
+// TODO(ashankar): Push this mechanism and get rid of Session::Extend()
+// as a means of enhancing an existing Graph.
+struct ImportGraphDefOptions {
+  ImportGraphDefOptions() {}
+
+  // Name prefix to use for nodes imported from the GraphDef.  For example, if
+  // prefix="animals" and GraphDef contains a node "bunny" then the node will be
+  // named "animals/bunny" in *g.
+  string prefix;
+
+  // TODO(ashankar): Enable node rebinding (in Python's import_graph_def
+  // this is achieved by providing an input_map).
+  //
+  // TODO(ashankar): Enable handling of GraphDefs produced by newer binaries
+  // with ops that are not defined in the binary calling ImportGraphDef.
+  // Similar to the producer_op_list argument to import_graph_def in the
+  // python API.
+};
+extern Status ImportGraphDef(const ImportGraphDefOptions& opts,
+                             const GraphDef& gdef, Graph* g,
+                             ShapeRefiner* refiner);
+
 // Make a copy of "src" into "*dest".
 //
 // REQUIRES: "*dest" is a freshly allocated graph without any nodes or edges
diff --git a/tensorflow/core/graph/graph_constructor_test.cc b/tensorflow/core/graph/graph_constructor_test.cc
index 4fcddc2052b..b61224bd2bb 100644
--- a/tensorflow/core/graph/graph_constructor_test.cc
+++ b/tensorflow/core/graph/graph_constructor_test.cc
@@ -16,7 +16,10 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 
 #include <vector>
+#include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -24,7 +27,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
-#include "tensorflow/core/platform/regexp.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/version.h"
 
@@ -37,40 +39,42 @@ namespace {
 
 class GraphConstructorTest : public ::testing::Test {
  protected:
-  GraphConstructorTest() : g_(new Graph(OpRegistry::Global())) {}
-  ~GraphConstructorTest() override {}
+  GraphConstructorTest() : graph_(OpRegistry::Global()) {}
 
   void Convert(const string& gdef_ascii) {
     CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &gdef_));
   }
 
-  void ExpectError(const string& gdef_ascii, const string& expected_error_re) {
+  void ExpectError(const string& gdef_ascii,
+                   const std::vector<string>& expected_error_strs) {
     Convert(gdef_ascii);
     GraphConstructorOptions opts;
-    Status status = ConvertGraphDefToGraph(opts, gdef_, g_.get());
+    Status status = ConvertGraphDefToGraph(opts, gdef_, &graph_);
     EXPECT_FALSE(status.ok());
-    EXPECT_TRUE(RE2::PartialMatch(status.error_message(), expected_error_re))
-        << status;
+
+    for (const string& error : expected_error_strs) {
+      EXPECT_TRUE(status.error_message().find(error) != string::npos)
+          << "Expected to find '" << error << "' in " << status;
+    }
   }
 
   void ExpectOK(const string& gdef_ascii) {
     Convert(gdef_ascii);
     GraphConstructorOptions opts;
-    TF_CHECK_OK(ConvertGraphDefToGraph(opts, gdef_, g_.get()));
+    TF_CHECK_OK(ConvertGraphDefToGraph(opts, gdef_, &graph_));
   }
 
   void ExpectVersions(int min_consumer, int producer) {
-    EXPECT_NE(nullptr, g_);
-    EXPECT_EQ(min_consumer, g_->versions().min_consumer())
+    EXPECT_EQ(min_consumer, graph_.versions().min_consumer())
         << "Expected min consumer " << min_consumer << ", got "
-        << g_->versions().min_consumer();
-    EXPECT_EQ(producer, g_->versions().producer()) << "Expected producer "
-                                                   << producer << ", got "
-                                                   << g_->versions().producer();
+        << graph_.versions().min_consumer();
+    EXPECT_EQ(producer, graph_.versions().producer())
+        << "Expected producer " << producer << ", got "
+        << graph_.versions().producer();
   }
 
   Node* FindNode(const string& name) {
-    for (Node* n : g_->nodes()) {
+    for (Node* n : graph_.nodes()) {
       if (n->name() == name) return n;
     }
     return nullptr;
@@ -78,65 +82,96 @@ class GraphConstructorTest : public ::testing::Test {
 
   bool HasNode(const string& name) { return FindNode(name) != nullptr; }
 
-  void ExpectNodes(const string& nodes) {
-    int count = 0;
-    std::vector<string> actual_nodes;
-    for (Node* n : g_->nodes()) {
-      if (n->IsOp()) {
-        count++;
-        actual_nodes.push_back(n->name());
-      }
-    }
-    std::sort(actual_nodes.begin(), actual_nodes.end());
-
-    LOG(INFO) << "Nodes present: " << str_util::Join(actual_nodes, " ");
-
-    std::vector<string> expected_nodes = str_util::Split(nodes, ',');
-    std::sort(expected_nodes.begin(), expected_nodes.end());
-    for (const string& s : expected_nodes) {
-      Node* n = FindNode(s);
-      EXPECT_TRUE(n != nullptr) << s;
-    }
-
-    EXPECT_TRUE(actual_nodes.size() == expected_nodes.size())
-        << "\nActual:   " << str_util::Join(actual_nodes, ",")
-        << "\nExpected: " << str_util::Join(expected_nodes, ",");
-  }
-
   bool HasEdge(const string& src, int src_out, const string& dst, int dst_in) {
-    for (const Edge* e : g_->edges()) {
+    for (const Edge* e : graph_.edges()) {
       if (e->src()->name() == src && e->src_output() == src_out &&
           e->dst()->name() == dst && e->dst_input() == dst_in)
         return true;
     }
     return false;
   }
+
   bool HasControlEdge(const string& src, const string& dst) {
     return HasEdge(src, Graph::kControlSlot, dst, Graph::kControlSlot);
   }
 
+  string ColocationGroup(const string& node) {
+    Node* n = nullptr;
+    for (Node* ni : graph_.nodes()) {
+      if (ni->name() == node) {
+        n = ni;
+        break;
+      }
+    }
+    if (n == nullptr) {
+      return "";
+    }
+    std::vector<string> value;
+    Status s = GetNodeAttr(n->def(), kColocationAttrName, &value);
+    if (!s.ok()) {
+      return "";
+    }
+    if (value.size() != 1) {
+      ADD_FAILURE()
+          << "ColocationGroup was written with the assumption of at most 1 "
+             "value for the _class attribute. Update it and its callers";
+      return "";
+    }
+    StringPiece loc(value[0]);
+    return loc.Consume(kColocationGroupPrefix) ? loc.ToString() : "";
+  }
+
+  string GraphDebugString() const {
+    GraphDef def;
+    graph_.ToGraphDef(&def);
+    return def.DebugString();
+  }
+
+  Graph graph_;
+
  private:
   GraphDef gdef_;
-  std::unique_ptr<Graph> g_;
 };
 
+Status Scalars(shape_inference::InferenceContext* c) {
+  for (int i = 0; i < c->num_outputs(); ++i) {
+    c->set_output(i, c->Scalar());
+  }
+  return Status::OK();
+}
+
 REGISTER_OP("ABC");
-REGISTER_OP("TestParams").Output("o: float");
+REGISTER_OP("TestParams").Output("o: float").SetShapeFn(Scalars);
 REGISTER_OP("TestInput").Output("a: float").Output("b: float");
-REGISTER_OP("TestMul").Input("a: float").Input("b: float").Output("o: float");
+REGISTER_OP("TestMul")
+    .Input("a: float")
+    .Input("b: float")
+    .Output("o: float")
+    .SetShapeFn(Scalars);
 REGISTER_OP("TestInt").Input("a: int32");
+REGISTER_OP("TestOneInputTwoOutputs")
+    .Input("x: float")
+    .Output("y: float")
+    .Output("z: float")
+    .SetShapeFn(Scalars);
+REGISTER_OP("TestOneInputOneOutput")
+    .Input("x: T")
+    .Output("y: T")
+    .Attr("T: {float, int64}")
+    .SetShapeFn(shape_inference::UnchangedShape);
+REGISTER_OP("TestDefaultAttr")
+    .Attr("default_int: int=31415")
+    .SetShapeFn(shape_inference::NoOutputs);
 
 TEST_F(GraphConstructorTest, InvalidNodeName) {
   auto expect_invalid_name = [this](const char* name) {
     ExpectError(strings::StrCat("node { name: '", name, "' op: 'ABC' }"),
-                strings::StrCat("Node '", name,
-                                "': Node name contains invalid characters"));
+                {"Node name contains invalid characters"});
   };
 
   expect_invalid_name("a:b");
   expect_invalid_name("_abc");  // Can't start with '_'
   // Name is a\b, but proto text format escapes slashes so we use a\\b here.
-  // This works for ExpectError too, since re2 also treats \\ as one slash.
   expect_invalid_name(R"(a\\b)");
   expect_invalid_name("/a");
   expect_invalid_name("-a");
@@ -153,7 +188,7 @@ TEST_F(GraphConstructorTest, InvalidSourceNodeName) {
       "node { name: 'input' op: 'TestInput' }"
       "node { name: 't1' op: 'TestMul' input: 'W999' input: 'input' }",
 
-      "Unknown input node.*W999");
+      {"Unknown input node", "W999"});
 }
 
 TEST_F(GraphConstructorTest, InvalidSourceNodeIndex) {
@@ -162,7 +197,7 @@ TEST_F(GraphConstructorTest, InvalidSourceNodeIndex) {
       "node { name: 'input' op: 'TestInput' }"
       "node { name: 't1' op: 'TestMul' input: [ 'W1:1', 'input:1' ] }",
 
-      "Connecting to invalid output 1 of source node W1");
+      {"Connecting to invalid output 1 of source node W1"});
 }
 
 TEST_F(GraphConstructorTest, GraphWithCycle) {
@@ -171,7 +206,219 @@ TEST_F(GraphConstructorTest, GraphWithCycle) {
       "node { name: 't1' op: 'TestMul' input: [ 'input:0', 't2' ] }"
       "node { name: 't2' op: 'TestMul' input: [ 'input:1', 't1' ] }",
 
-      "cycle");
+      {"cycle"});
+}
+
+TEST_F(GraphConstructorTest, GraphWithOKCycle) {
+  // Test graph produced in python using:
+  /*
+     with tf.Graph().as_default():
+       i = tf.constant(0)
+       c = lambda i: tf.less(i, 10)
+       b = lambda i: tf.add(i, 1)
+       r = tf.while_loop(c, b, [i])
+       with open('/tmp/graph.txt', 'w') as f:
+         f.write(str(tf.get_default_graph().as_graph_def()))
+  */
+  ExpectOK(R"EOF(
+node {
+  name: "Const"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "while/Enter"
+  op: "Enter"
+  input: "Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "while/while/"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "while/Merge"
+  op: "Merge"
+  input: "while/Enter"
+  input: "while/NextIteration"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Less/y"
+  op: "Const"
+  input: "^while/Merge"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "while/Less"
+  op: "Less"
+  input: "while/Merge"
+  input: "while/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/LoopCond"
+  op: "LoopCond"
+  input: "while/Less"
+}
+node {
+  name: "while/Switch"
+  op: "Switch"
+  input: "while/Merge"
+  input: "while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@while/Merge"
+      }
+    }
+  }
+}
+node {
+  name: "while/Identity"
+  op: "Identity"
+  input: "while/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Add/y"
+  op: "Const"
+  input: "^while/Identity"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "while/Add"
+  op: "Add"
+  input: "while/Identity"
+  input: "while/Add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/NextIteration"
+  op: "NextIteration"
+  input: "while/Add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Exit"
+  op: "Exit"
+  input: "while/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+versions {
+  producer: 11
+}
+  )EOF");
 }
 
 TEST_F(GraphConstructorTest, TypeMismatch) {
@@ -179,13 +426,13 @@ TEST_F(GraphConstructorTest, TypeMismatch) {
       "node { name: 'input' op: 'TestInput' }"
       "node { name: 'int' op: 'TestInt' input: [ 'input' ] }",
 
-      "Input 0 of node int was passed float from input:0 incompatible with "
-      "expected int32.");
+      {"Input 0 of node int was passed float from input:0 incompatible with "
+       "expected int32."});
 }
 
 TEST_F(GraphConstructorTest, EmptyGraph) {
   ExpectOK("");
-  ExpectVersions(0, 0);  // The default GraphDef versions are 0
+  ExpectVersions(0, 0);
 }
 
 TEST_F(GraphConstructorTest, VersionGraph) {
@@ -197,20 +444,20 @@ TEST_F(GraphConstructorTest, VersionGraph) {
 
 TEST_F(GraphConstructorTest, LowVersion) {
   ExpectError(strings::StrCat("versions { producer: ", -1, " }"),
-              strings::StrCat(R"(^GraphDef producer version -1 below min )"
-                              "producer ",
-                              TF_GRAPH_DEF_VERSION_MIN_PRODUCER,
-                              " supported by TensorFlow ", TF_VERSION_STRING,
-                              R"(\.  Please regenerate your graph\.$)"));
+              {strings::StrCat("GraphDef producer version -1 below min "
+                               "producer ",
+                               TF_GRAPH_DEF_VERSION_MIN_PRODUCER,
+                               " supported by TensorFlow ", TF_VERSION_STRING,
+                               ".  Please regenerate your graph.")});
 }
 
 TEST_F(GraphConstructorTest, HighVersion) {
   const int version = TF_GRAPH_DEF_VERSION + 1;
   ExpectError(strings::StrCat("versions { min_consumer: ", version, " }"),
-              strings::StrCat(R"(^GraphDef min consumer version )", version,
-                              " above current version ", TF_GRAPH_DEF_VERSION,
-                              " for TensorFlow ", TF_VERSION_STRING,
-                              R"(\.  Please upgrade TensorFlow\.$)"));
+              {strings::StrCat("GraphDef min consumer version ", version,
+                               " above current version ", TF_GRAPH_DEF_VERSION,
+                               " for TensorFlow ", TF_VERSION_STRING,
+                               ".  Please upgrade TensorFlow.")});
 }
 
 TEST_F(GraphConstructorTest, BadVersion) {
@@ -219,9 +466,9 @@ TEST_F(GraphConstructorTest, BadVersion) {
   ExpectError(
       strings::StrCat("versions { producer: ", version, " bad_consumers: ", bad,
                       " }"),
-      strings::StrCat(
-          R"(^GraphDef disallows consumer version )", bad,
-          R"(\.  Please upgrade TensorFlow: this version is likely buggy\.$)"));
+      {strings::StrCat(
+          "GraphDef disallows consumer version ", bad,
+          ".  Please upgrade TensorFlow: this version is likely buggy.")});
 }
 
 TEST_F(GraphConstructorTest, SimpleModel) {
@@ -260,7 +507,527 @@ TEST_F(GraphConstructorTest, Error_ControlEdgeBeforeRealInput) {
       "node { name: 'input' op: 'TestInput' input: [ '^W1' ] }"
       "node { name: 't1' op: 'TestMul' input: [ 'W1', 'input:1' ] }"
       "node { name: 't2' op: 'TestMul' input: [ 'W1', '^t1', 'input:1' ] }",
-      "Node 't2': Control dependencies must come after regular dependencies");
+      {"Node 't2': Control dependencies must come after regular dependencies"});
+}
+
+TEST_F(GraphConstructorTest, ImportGraphDef) {
+  GraphDef def;
+  ImportGraphDefOptions opts;
+  const string& source = graph_.FindNodeId(Graph::kSourceId)->name();
+  const string& sink = graph_.FindNodeId(Graph::kSinkId)->name();
+
+  // Importing an empty graph is fine.
+  Status s = ImportGraphDef(opts, def, &graph_, nullptr);
+  ASSERT_EQ(Status::OK(), s) << s;
+  EXPECT_EQ(2, graph_.num_nodes());
+  EXPECT_TRUE(HasControlEdge(source, sink));
+  EXPECT_EQ(1, graph_.num_edges());
+
+  bool parsed = protobuf::TextFormat::ParseFromString(
+      R"EOF(
+        node { name: "A" op: "TestParams" }
+        node { name: "X" op: "TestParams" }
+        node {
+          name: "B"
+          op: "TestOneInputTwoOutputs"
+          input: "A"
+          attr {
+            key: "_class"
+            value { list { s: "loc:@A" } }
+          }
+        }
+        node {
+          name: "C"
+          op: "TestOneInputTwoOutputs"
+          input: "B:1"
+          input: "^X"
+        }
+        node {
+          name: "D"
+          op: "TestMul"
+          input: "B:0"
+          input: "C:0"
+        })EOF",
+      &def);
+  ASSERT_TRUE(parsed);
+
+  // First import should work out fine.
+  s = ImportGraphDef(opts, def, &graph_, nullptr);
+  ASSERT_EQ(Status::OK(), s) << s;
+  EXPECT_EQ(5 + 2, graph_.num_nodes());  // Added nodes + source and sink
+  EXPECT_EQ("A", ColocationGroup("B"));
+  EXPECT_TRUE(HasEdge("A", 0, "B", 0));
+  EXPECT_TRUE(HasEdge("B", 1, "C", 0));
+  EXPECT_TRUE(HasEdge("B", 0, "D", 0));
+  EXPECT_TRUE(HasEdge("C", 0, "D", 1));
+  EXPECT_TRUE(HasControlEdge("X", "C"));
+  EXPECT_TRUE(HasControlEdge(source, sink));
+  EXPECT_TRUE(HasControlEdge(source, "A"));
+  EXPECT_TRUE(HasControlEdge(source, "X"));
+  EXPECT_TRUE(HasControlEdge("D", sink));
+  EXPECT_EQ(9, graph_.num_edges());
+
+  // Importing again should fail because of node name collisions.
+  s = ImportGraphDef(opts, def, &graph_, nullptr);
+  EXPECT_TRUE(errors::IsInvalidArgument(s)) << s;
+
+  // But succeed if a unique prefix is provided.
+  opts.prefix = "import";
+  s = ImportGraphDef(opts, def, &graph_, nullptr);
+  ASSERT_EQ(Status::OK(), s) << s;
+  EXPECT_EQ(
+      10 + 2,
+      graph_.num_nodes());  // Added nodes + original nodes + source and sink
+  EXPECT_EQ("A", ColocationGroup("B"));
+  EXPECT_EQ("import/A", ColocationGroup("import/B"));
+  EXPECT_TRUE(HasEdge("A", 0, "B", 0));
+  EXPECT_TRUE(HasEdge("B", 1, "C", 0));
+  EXPECT_TRUE(HasEdge("B", 0, "D", 0));
+  EXPECT_TRUE(HasEdge("C", 0, "D", 1));
+  EXPECT_TRUE(HasControlEdge("X", "C"));
+  EXPECT_TRUE(HasEdge("import/A", 0, "import/B", 0));
+  EXPECT_TRUE(HasEdge("import/B", 1, "import/C", 0));
+  EXPECT_TRUE(HasEdge("import/B", 0, "import/D", 0));
+  EXPECT_TRUE(HasEdge("import/C", 0, "import/D", 1));
+  EXPECT_TRUE(HasControlEdge("import/X", "import/C"));
+  EXPECT_TRUE(HasControlEdge(source, sink));
+  EXPECT_TRUE(HasControlEdge(source, "A"));
+  EXPECT_TRUE(HasControlEdge(source, "X"));
+  EXPECT_TRUE(HasControlEdge("D", sink));
+  EXPECT_TRUE(HasControlEdge(source, "import/A"));
+  EXPECT_TRUE(HasControlEdge(source, "import/X"));
+  EXPECT_TRUE(HasControlEdge("import/D", sink));
+  EXPECT_EQ(17, graph_.num_edges());
+}
+
+TEST_F(GraphConstructorTest, ImportGraphDef_DefaultAttrs) {
+  GraphDef def;
+  ASSERT_TRUE(protobuf::TextFormat::ParseFromString(
+      "node{ name:'A' op:'TestDefaultAttr'}", &def));
+  Status s = ImportGraphDef(ImportGraphDefOptions(), def, &graph_, nullptr);
+  ASSERT_EQ(Status::OK(), s) << s;
+  Node* a = nullptr;
+  for (Node* n : graph_.nodes()) {
+    if (n->name() == "A") {
+      a = n;
+      break;
+    }
+  }
+  ASSERT_TRUE(a != nullptr);
+  int value = 0;
+  s = GetNodeAttr(a->def(), "default_int", &value);
+  ASSERT_EQ(Status::OK(), s) << s << " -- " << a->def().DebugString();
+  EXPECT_EQ(31415, value);
+}
+
+TEST_F(GraphConstructorTest, ImportGraphDef_Versioning) {
+  GraphDef def;
+  const ImportGraphDefOptions opts;
+
+  def.mutable_versions()->set_producer(TF_GRAPH_DEF_VERSION_MIN_PRODUCER - 1);
+  Status s = ImportGraphDef(opts, def, &graph_, nullptr);
+  EXPECT_TRUE(errors::IsInvalidArgument(s)) << s;
+
+  def.mutable_versions()->Clear();
+  def.mutable_versions()->set_min_consumer(TF_GRAPH_DEF_VERSION + 1);
+  s = ImportGraphDef(opts, def, &graph_, nullptr);
+  EXPECT_TRUE(errors::IsInvalidArgument(s)) << s;
+
+  def.mutable_versions()->Clear();
+  def.mutable_versions()->add_bad_consumers(TF_GRAPH_DEF_VERSION);
+  s = ImportGraphDef(opts, def, &graph_, nullptr);
+  EXPECT_TRUE(errors::IsInvalidArgument(s)) << s;
+
+  def.mutable_versions()->Clear();
+  graph_.ToGraphDef(&def);
+  s = ImportGraphDef(opts, def, &graph_, nullptr);
+  EXPECT_EQ(Status::OK(), s) << s;
+
+  def.Clear();
+  const int original_min_consumer = graph_.versions().min_consumer();
+  def.mutable_versions()->set_min_consumer(original_min_consumer + 2);
+  def.mutable_versions()->add_bad_consumers(TF_GRAPH_DEF_VERSION - 1);
+  s = ImportGraphDef(opts, def, &graph_, nullptr);
+  EXPECT_EQ(Status::OK(), s) << s;
+  EXPECT_EQ(original_min_consumer + 2, graph_.versions().min_consumer());
+  ASSERT_EQ(1, graph_.versions().bad_consumers_size());
+  EXPECT_EQ(TF_GRAPH_DEF_VERSION - 1, graph_.versions().bad_consumers(0));
+}
+
+TEST_F(GraphConstructorTest, ImportGraphDef_ShapeWhitelist) {
+  // Barrier's shape is an output vector of 2, but the graph says it's a vector
+  // of 1. This is currently whitelisted.
+  GraphDef def;
+  bool parsed = protobuf::TextFormat::ParseFromString(
+      R"EOF(
+      node {
+        name: "A"
+        op: "Barrier"
+        attr {
+          key: "_output_shapes"
+          value { list { shape {} } }
+        }
+        attr {
+          key: "component_types"
+          value { list { type: DT_FLOAT } }
+        }
+      }
+      )EOF",
+      &def);
+  ASSERT_TRUE(parsed);
+  Status s = ImportGraphDef(ImportGraphDefOptions(), def, &graph_, nullptr);
+  EXPECT_EQ(Status::OK(), s) << s;
+}
+
+TEST_F(GraphConstructorTest, ImportGraphDef_WithCycle) {
+  // Test graph produced in python using:
+  /*
+     with tf.Graph().as_default():
+       i = tf.constant(0)
+       c = lambda i: tf.less(i, 10)
+       b = lambda i: tf.add(i, 1)
+       r = tf.while_loop(c, b, [i])
+       with open('/tmp/graph.txt', 'w') as f:
+         f.write(str(tf.get_default_graph().as_graph_def()))
+  */
+  GraphDef def;
+  bool parsed = protobuf::TextFormat::ParseFromString(
+      R"EOF(
+node {
+  name: "Const"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "while/Enter"
+  op: "Enter"
+  input: "Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "while/while/"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "while/Merge"
+  op: "Merge"
+  input: "while/Enter"
+  input: "while/NextIteration"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Less/y"
+  op: "Const"
+  input: "^while/Merge"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "while/Less"
+  op: "Less"
+  input: "while/Merge"
+  input: "while/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/LoopCond"
+  op: "LoopCond"
+  input: "while/Less"
+}
+node {
+  name: "while/Switch"
+  op: "Switch"
+  input: "while/Merge"
+  input: "while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@while/Merge"
+      }
+    }
+  }
+}
+node {
+  name: "while/Identity"
+  op: "Identity"
+  input: "while/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Add/y"
+  op: "Const"
+  input: "^while/Identity"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "while/Add"
+  op: "Add"
+  input: "while/Identity"
+  input: "while/Add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/NextIteration"
+  op: "NextIteration"
+  input: "while/Add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Exit"
+  op: "Exit"
+  input: "while/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+versions {
+  producer: 11
+}
+  )EOF",
+      &def);
+  ASSERT_TRUE(parsed);
+  Status s = ImportGraphDef(ImportGraphDefOptions(), def, &graph_, nullptr);
+  EXPECT_EQ(Status::OK(), s) << s;
+}
+
+TEST_F(GraphConstructorTest, ImportGraphDef_ErrorsDoNoChangeTheGraph) {
+  GraphDef def;
+  NodeDefBuilder("scope/A", "TestParams").Finalize(def.add_node());
+  ImportGraphDefOptions opts;
+  const string& source = graph_.FindNodeId(Graph::kSourceId)->name();
+  const string& sink = graph_.FindNodeId(Graph::kSinkId)->name();
+
+  Status s = ImportGraphDef(opts, def, &graph_, nullptr);
+  ASSERT_EQ(Status::OK(), s) << s;
+  EXPECT_EQ(3, graph_.num_nodes());  // 'scope/A', source and sink
+  EXPECT_TRUE(HasControlEdge(source, sink));
+  EXPECT_TRUE(HasControlEdge(source, "scope/A"));
+  EXPECT_TRUE(HasControlEdge("scope/A", sink));
+  EXPECT_EQ(3, graph_.num_edges());
+  const string original_graph_description = GraphDebugString();
+
+#define EXPECT_IMPORT_FAILURE(graph_def, options, expected_err)             \
+  do {                                                                      \
+    Status s = ImportGraphDef(options, graph_def, &graph_, nullptr);        \
+    EXPECT_NE(Status::OK(), s) << s;                                        \
+    EXPECT_TRUE(s.error_message().find(expected_err) != string::npos) << s; \
+    const string graph_description = GraphDebugString();                    \
+    EXPECT_EQ(original_graph_description, graph_description);               \
+    EXPECT_EQ(3, graph_.num_nodes());                                       \
+    EXPECT_TRUE(HasControlEdge(source, sink));                              \
+    EXPECT_TRUE(HasControlEdge(source, "scope/A"));                         \
+    EXPECT_TRUE(HasControlEdge("scope/A", sink));                           \
+    EXPECT_EQ(3, graph_.num_edges());                                       \
+  } while (0)
+
+  EXPECT_IMPORT_FAILURE(def, opts,
+                        "Node 'scope/A' already exists in the Graph");
+
+  GraphDef bad_def;
+  ASSERT_TRUE(protobuf::TextFormat::ParseFromString(
+      "node{name:'!B' op:'TestParams'}", &bad_def));
+  EXPECT_IMPORT_FAILURE(bad_def, opts,
+                        "Node '!B': Node name contains invalid characters");
+
+  opts.prefix = "!bad_prefix";
+  EXPECT_IMPORT_FAILURE(def, opts,
+                        "Imported node name prefix '!bad_prefix/' would lead "
+                        "to invalid node names");
+
+  opts.prefix = "import";
+  ASSERT_TRUE(protobuf::TextFormat::ParseFromString(
+      "node{name:'B' op:'SomeUnknownOp'}", &bad_def));
+  EXPECT_IMPORT_FAILURE(bad_def, opts,
+                        "Op type not registered 'SomeUnknownOp'");
+
+  ASSERT_TRUE(protobuf::TextFormat::ParseFromString(
+      "node{name:'B' op:'TestOneInputTwoOutputs' input:'C'}", &bad_def));
+  EXPECT_IMPORT_FAILURE(bad_def, opts, "Node 'B': Unknown input node 'C'");
+
+  bool parsed = protobuf::TextFormat::ParseFromString(
+      R"EOF(
+      node{ name:"Root" op:"TestParams" } # TestParams produces a float
+      node{
+        name:"Integer"
+        op:"TestOneInputOneOutput"
+        attr{ key:"T" value{ type:DT_INT64 } }
+        input: "Root"
+      }
+      )EOF",
+      &bad_def);
+  ASSERT_TRUE(parsed);
+  EXPECT_IMPORT_FAILURE(bad_def, opts,
+                        "Input 0 of node import/Integer was passed float from "
+                        "import/Root:0 incompatible with expected int64");
+
+  parsed = protobuf::TextFormat::ParseFromString(
+      R"EOF(
+      node{ name:"A" op:"TestParams" }
+      node{ name:"B" op:"TestOneInputTwoOutputs" input:"A:1" }
+      )EOF",
+      &bad_def);
+  ASSERT_TRUE(parsed);
+  EXPECT_IMPORT_FAILURE(bad_def, opts,
+                        "Node 'B': Connecting to invalid output 1 of source "
+                        "node A which has 1 outputs");
+
+  parsed = protobuf::TextFormat::ParseFromString(
+      R"EOF(
+      node{ name:"A" op:"TestParams" }
+      node{ name:"B" op:"TestParams" }
+      node{ name:"C" op:"TestOneInputTwoOutputs" input:"A" input:"B" }
+      )EOF",
+      &bad_def);
+  ASSERT_TRUE(parsed);
+  EXPECT_IMPORT_FAILURE(bad_def, opts, "do not match 2 inputs specified");
+
+  ASSERT_TRUE(protobuf::TextFormat::ParseFromString(
+      "node{ name:'A' op:'TestOneInputTwoOutputs' }", &bad_def));
+  EXPECT_IMPORT_FAILURE(bad_def, opts, "do not match 0 inputs specified");
+
+  parsed = protobuf::TextFormat::ParseFromString(
+      R"EOF(
+      node{
+        name:"A"
+        op:"TestParams"
+        attr{
+          key:"_class"
+          value{ list{ s:"loc:@B" } }
+        }
+      })EOF",
+      &bad_def);
+  ASSERT_TRUE(parsed);
+  EXPECT_IMPORT_FAILURE(
+      bad_def, opts, "Node 'A' expects to be colocated with unknown node 'B'");
+
+  opts.prefix = "";
+  ASSERT_TRUE(protobuf::TextFormat::ParseFromString(
+      "node{name:'scope/A' op:'TestParams'}", &bad_def));
+  EXPECT_IMPORT_FAILURE(bad_def, opts,
+                        "Node 'scope/A' already exists in the Graph");
+
+  parsed = protobuf::TextFormat::ParseFromString(
+      R"EOF(
+      node { name: "A" op: "TestParams" }
+      node { name: "B" op: "L2Loss"
+             input: "A:0"
+             attr { key: "T" value { type: DT_FLOAT } }
+             attr { key: "_output_shapes"
+                    value { list { shape { dim { size: 43 } } } } } }
+      )EOF",
+      &bad_def);
+  ASSERT_TRUE(parsed);
+  EXPECT_IMPORT_FAILURE(bad_def, opts,
+                        "Node 'B' has an _output_shapes attribute inconsistent "
+                        "with the GraphDef for output #0");
+#undef EXPECT_IMPORT_FAILURE
 }
 
 TEST_F(GraphConstructorTest, CopyGraph) {
diff --git a/tensorflow/core/graph/node_builder.cc b/tensorflow/core/graph/node_builder.cc
index 0e568869406..27d89295958 100644
--- a/tensorflow/core/graph/node_builder.cc
+++ b/tensorflow/core/graph/node_builder.cc
@@ -74,7 +74,7 @@ NodeBuilder& NodeBuilder::Input(gtl::ArraySlice<NodeOut> src_list) {
       inputs_.emplace_back(node_out.node, node_out.index);
     }
   }
-  def_builder_.Input(srcs);
+  def_builder_.Input(gtl::ArraySlice<NodeDefBuilder::NodeOut>(srcs));
   return *this;
 }
 
diff --git a/tensorflow/core/graph/subgraph_test.cc b/tensorflow/core/graph/subgraph_test.cc
index df7b7df4a53..e3f6504ff95 100644
--- a/tensorflow/core/graph/subgraph_test.cc
+++ b/tensorflow/core/graph/subgraph_test.cc
@@ -28,7 +28,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
-#include "tensorflow/core/platform/regexp.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
diff --git a/tensorflow/core/graph/tensor_id_test.cc b/tensorflow/core/graph/tensor_id_test.cc
index b1def86b1a0..143606db032 100644
--- a/tensorflow/core/graph/tensor_id_test.cc
+++ b/tensorflow/core/graph/tensor_id_test.cc
@@ -23,10 +23,7 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-static string ParseHelper(const string& n) {
-  TensorId id = ParseTensorName(n);
-  return strings::StrCat(id.first, ":", id.second);
-}
+string ParseHelper(const string& n) { return ParseTensorName(n).ToString(); }
 
 TEST(TensorIdTest, ParseTensorName) {
   EXPECT_EQ(ParseHelper("W1"), "W1:0");
@@ -36,12 +33,12 @@ TEST(TensorIdTest, ParseTensorName) {
   EXPECT_EQ(ParseHelper("xyz1_17"), "xyz1_17:0");
 }
 
-static uint32 Skewed(random::SimplePhilox* rnd, int max_log) {
+uint32 Skewed(random::SimplePhilox* rnd, int max_log) {
   const uint32 space = 1 << (rnd->Rand32() % (max_log + 1));
   return rnd->Rand32() % space;
 }
 
-static void BM_ParseTensorName(int iters, int arg) {
+void BM_ParseTensorName(int iters, int arg) {
   testing::StopTiming();
   random::PhiloxRandom philox(301, 17);
   random::SimplePhilox rnd(&philox);
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 408a74ded0c..750ea702043 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -15,6 +15,11 @@ package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])  # Apache 2.0
 
+package_group(
+    name = "friends",
+    packages = ["//tensorflow/..."],
+)
+
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_test",
@@ -406,15 +411,15 @@ tf_kernel_libraries(
         "where_op",
     ],
     deps = [
-        ":batchtospace_op",
+        ":batch_space_ops",
         ":bounds_check",
         ":concat_lib",
         ":cuda_device_array",
         ":depth_space_ops",
         ":extract_image_patches_op",
         ":fill_functor",
+        ":gather_op_cpu_impl",
         ":ops_util",
-        ":spacetobatch_op",
         ":split_lib",
         ":strided_slice_op",
         ":transpose_functor",
@@ -432,6 +437,18 @@ tf_kernel_libraries(
     ],
 )
 
+cc_library(
+    name = "gather_op_cpu_impl",
+    hdrs = ["gather_op_cpu_impl.h"],
+    visibility = [":friends"],
+    deps = [
+        ":bounds_check",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//third_party/eigen3",
+    ],
+)
+
 tf_cc_test(
     name = "batch_norm_op_test",
     size = "small",
@@ -1582,22 +1599,39 @@ tf_kernel_library(
 )
 
 tf_kernel_library(
-    name = "batchtospace_op",
-    prefix = "batchtospace_op",
+    name = "batch_space_ops",
+    srcs = [
+        "batchtospace_op.cc",
+        "spacetobatch_functor.cc",
+        "spacetobatch_functor.h",
+        "spacetobatch_op.cc",
+    ],
+    gpu_srcs = [
+        "spacetobatch_functor.h",
+        "spacetobatch_functor_gpu.cu.cc",
+    ],
+    visibility = ["//visibility:private"],
     deps = [
+        ":bounds_check",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//third_party/eigen3",
     ],
 )
 
-tf_kernel_library(
-    name = "spacetobatch_op",
-    prefix = "spacetobatch_op",
+tf_cuda_cc_test(
+    name = "spacetobatch_benchmark_test",
+    srcs = ["spacetobatch_benchmark_test.cc"],
     deps = [
+        ":batch_space_ops",
+        ":ops_testutil",
+        ":ops_util",
+        "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//third_party/eigen3",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
     ],
 )
 
@@ -1946,6 +1980,7 @@ filegroup(
         "function_ops.cc",
         "gather_op.cc",
         "gather_op.h",
+        "gather_op_cpu_impl.h",
         "identity_op.cc",
         "identity_op.h",
         "immutable_constant_op.cc",
diff --git a/tensorflow/core/kernels/batchtospace_op.cc b/tensorflow/core/kernels/batchtospace_op.cc
index e17fece5dbb..8a2c5e21ac5 100644
--- a/tensorflow/core/kernels/batchtospace_op.cc
+++ b/tensorflow/core/kernels/batchtospace_op.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include <string>
 #include <utility>
 
-#include "tensorflow/core/kernels/batchtospace_op.h"
+#include "tensorflow/core/kernels/spacetobatch_functor.h"
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op.h"
@@ -39,6 +39,177 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
+template <typename Device, typename T>
+static void BatchToSpaceOpCompute(OpKernelContext* context,
+                                  const Tensor& orig_input_tensor,
+                                  const Tensor& orig_block_shape,
+                                  const Tensor& orig_crops) {
+  const int input_dims = orig_input_tensor.dims();
+  OP_REQUIRES(
+      context, TensorShapeUtils::IsVector(orig_block_shape.shape()),
+      errors::InvalidArgument("block_shape rank should be 1 instead of ",
+                              orig_block_shape.dims()));
+
+  const int block_dims = orig_block_shape.dim_size(0);
+  OP_REQUIRES(
+      context, orig_input_tensor.dims() >= 1 + block_dims,
+      errors::InvalidArgument("input rank should be >= ", 1 + block_dims,
+                              " instead of ", orig_input_tensor.dims()));
+
+  OP_REQUIRES(context, TensorShapeUtils::IsMatrix(orig_crops.shape()) &&
+                           block_dims == orig_crops.dim_size(0) &&
+                           2 == orig_crops.dim_size(1),
+              errors::InvalidArgument("crops should have shape [", block_dims,
+                                      ", 2] instead of ",
+                                      orig_crops.shape().DebugString()));
+  // To avoid out-of-bounds access in the case that the block_shape and/or
+  // crops tensors are concurrently modified, we must copy the values.
+  gtl::InlinedVector<int64, 4> block_shape;
+  gtl::InlinedVector<int64, 8> crops;
+  internal::spacetobatch::SubtleMustCopyFlat(orig_block_shape, &block_shape);
+  internal::spacetobatch::SubtleMustCopyFlat(orig_crops, &crops);
+
+  // Determine the length of the prefix of block dims that can be combined
+  // into the batch dimension due to having no padding and block_shape=1.
+  int removed_prefix_block_dims = 0;
+  for (; removed_prefix_block_dims < block_dims; ++removed_prefix_block_dims) {
+    const int dim = removed_prefix_block_dims;
+    if (crops[2 * dim] != 0 || crops[2 * dim + 1] != 0 ||
+        block_shape[dim] != 1) {
+      break;
+    }
+  }
+
+  // Determine the length of the suffix of block dims that can be combined
+  // into the depth dimension due to having no padding and block_shape=1.
+  int removed_suffix_block_dims = 0;
+  for (; removed_suffix_block_dims < block_dims - removed_prefix_block_dims;
+       ++removed_suffix_block_dims) {
+    const int dim = block_dims - 1 - removed_suffix_block_dims;
+    if (crops[2 * dim] != 0 || crops[2 * dim + 1] != 0 ||
+        block_shape[dim] != 1) {
+      break;
+    }
+  }
+
+  // Compute the product of the block_shape values.
+  int64 block_shape_product = 1;
+  for (int block_dim = 0; block_dim < block_dims; ++block_dim) {
+    block_shape_product *= block_shape[block_dim];
+  }
+
+  const int64 orig_input_batch_size = orig_input_tensor.dim_size(0);
+  OP_REQUIRES(
+      context, orig_input_batch_size % block_shape_product == 0,
+      errors::InvalidArgument("Input batch dimension (", orig_input_batch_size,
+                              ") is not divisible by product of block sizes (",
+                              block_shape_product, ")"));
+
+  const int internal_block_dims =
+      block_dims - removed_prefix_block_dims - removed_suffix_block_dims;
+  OP_REQUIRES(context, internal_block_dims <= kMaxSpaceToBatchBlockDims,
+              errors::InvalidArgument(
+                  "Maximum number of non-combined block dimensions is ",
+                  internal_block_dims, " but must not exceed ",
+                  kMaxSpaceToBatchBlockDims));
+
+  if (internal_block_dims == 0) {
+    context->set_output(0, orig_input_tensor);
+    return;
+  }
+
+  // For the purpose of computing the result, the input will be treated as
+  // having this shape, of rank 2 + internal_block_dims.
+  TensorShape internal_input_shape;
+
+  // For the purpose of computing the result, the output will be treated as
+  // having this shape, of rank 2 + internal_block_dims.
+  TensorShape internal_output_shape;
+
+  // The actual output shape exposed to callers.
+  TensorShape external_output_shape;
+
+  external_output_shape.AddDim(orig_input_batch_size / block_shape_product);
+
+  int64 input_batch_size = orig_input_batch_size;
+  for (int block_dim = 0; block_dim < removed_prefix_block_dims; ++block_dim) {
+    const int64 size = orig_input_tensor.dim_size(block_dim + 1);
+    input_batch_size *= size;
+    external_output_shape.AddDim(size);
+  }
+  internal_input_shape.AddDim(input_batch_size);
+  internal_output_shape.AddDim(input_batch_size / block_shape_product);
+
+  for (int block_dim = removed_prefix_block_dims;
+       block_dim < block_dims - removed_suffix_block_dims; ++block_dim) {
+    const int64 crop_start = crops[2 * block_dim],
+                crop_end = crops[2 * block_dim + 1];
+    OP_REQUIRES(context, crop_start >= 0 && crop_end >= 0,
+                errors::InvalidArgument("Crops must be non-negative"));
+    const int64 input_size = orig_input_tensor.dim_size(block_dim + 1);
+    const int64 block_shape_value = block_shape[block_dim];
+    const int64 cropped_size =
+        input_size * block_shape_value - crop_start - crop_end;
+    OP_REQUIRES(context, cropped_size >= 0,
+                errors::InvalidArgument("cropped_shape[", block_dim, "]=",
+                                        cropped_size, " must be non-negative"));
+    internal_input_shape.AddDim(input_size);
+    internal_output_shape.AddDim(cropped_size);
+    external_output_shape.AddDim(cropped_size);
+  }
+
+  int64 depth = 1;
+  for (int dim = block_dims - removed_suffix_block_dims + 1; dim < input_dims;
+       ++dim) {
+    const int64 size = orig_input_tensor.dim_size(dim);
+    external_output_shape.AddDim(size);
+    depth *= size;
+  }
+  internal_input_shape.AddDim(depth);
+  internal_output_shape.AddDim(depth);
+
+  // Allocate output tensor.
+  Tensor* output_tensor = nullptr;
+  OP_REQUIRES_OK(context, context->allocate_output(0, external_output_shape,
+                                                   &output_tensor));
+
+  const int64* internal_crops = &crops[2 * removed_prefix_block_dims];
+  const int64* internal_block_shape = &block_shape[removed_prefix_block_dims];
+
+  switch (internal_block_dims) {
+#define TF_BATCHTOSPACE_BLOCK_DIMS_CASE(NUM_BLOCK_DIMS)                   \
+  case NUM_BLOCK_DIMS: {                                                  \
+    OP_REQUIRES_OK(                                                       \
+        context,                                                          \
+        (functor::SpaceToBatchFunctor<Device, T, NUM_BLOCK_DIMS, true>()( \
+            context->eigen_device<Device>(),                              \
+            output_tensor->shaped<T, NUM_BLOCK_DIMS + 2>(                 \
+                internal_output_shape.dim_sizes()),                       \
+            internal_block_shape, internal_crops,                         \
+            orig_input_tensor.shaped<T, NUM_BLOCK_DIMS + 2>(              \
+                internal_input_shape.dim_sizes()))));                     \
+  } break;                                                                \
+    /**/
+    TF_SPACETOBATCH_FOR_EACH_NUM_BLOCK_DIMS(TF_BATCHTOSPACE_BLOCK_DIMS_CASE)
+#undef TF_BATCHTOSPACE_BLOCK_DIMS_CASE
+  }
+}
+
+template <typename Device, typename T>
+class BatchToSpaceNDOp : public OpKernel {
+ public:
+  explicit BatchToSpaceNDOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& orig_input_tensor = context->input(0);
+    const Tensor& orig_block_shape = context->input(1);
+    const Tensor& orig_crops = context->input(2);
+    BatchToSpaceOpCompute<Device, T>(context, orig_input_tensor,
+                                     orig_block_shape, orig_crops);
+  }
+};
+
 template <typename Device, typename T>
 class BatchToSpaceOp : public OpKernel {
  public:
@@ -47,6 +218,12 @@ class BatchToSpaceOp : public OpKernel {
     OP_REQUIRES(
         context, block_size_ > 1,
         errors::InvalidArgument("Block size should be > 1: ", block_size_));
+    // We don't use context->allocate_persistent because the allocation must
+    // happen on the CPU regardless of Device.
+    block_shape_ = Tensor(tensorflow::DT_INT64, TensorShape({2}));
+    auto block_shape_vec = block_shape_.vec<int64>();
+    block_shape_vec(0) = block_size_;
+    block_shape_vec(1) = block_size_;
   }
 
   void Compute(OpKernelContext* context) override {
@@ -60,118 +237,42 @@ class BatchToSpaceOp : public OpKernel {
     OP_REQUIRES(context, kRequiredDims == dims,
                 errors::InvalidArgument("Input rank should be: ", kRequiredDims,
                                         "instead of: ", dims));
-
-    // The crops is presumed to be [2, 2] and contain non-negative values.
-    OP_REQUIRES(
-        context,
-        TensorShapeUtils::IsMatrix(in1.shape()) &&
-        in1.dim_size(0) == 2 && in1.dim_size(1) == 2,
-        errors::InvalidArgument("crops must be a 2 x 2 matrix: ",
-                                in1.shape().DebugString()));
-    TTypes<int32>::ConstMatrix crops = in1.matrix<int32>();
-    OP_REQUIRES(context,
-                crops(0, 0) >= 0 && crops(0, 1) >= 0 &&
-                crops(1, 0) >= 0 && crops(1, 1) >= 0,
-                errors::InvalidArgument("Crops must be non-negative"));
-
-    const int input_batch = in0.dim_size(0);
-    const int input_height = in0.dim_size(1);
-    const int input_width = in0.dim_size(2);
-    const int depth = in0.dim_size(3);
-
-    const int block_size_sq = block_size_ * block_size_;
-
-    // The batch must be divisible by block_size_ * block_size_
-    OP_REQUIRES(
-        context, input_batch % block_size_sq == 0,
-        errors::InvalidArgument("Input batch dimension ", input_batch,
-                                "should be divisible by: ", block_size_sq));
-
-
-    const int output_batch = input_batch / block_size_sq;
-    const int output_height =
-        input_height * block_size_ - crops(0, 0) - crops(0, 1);
-    const int output_width =
-        input_width * block_size_ - crops(1, 0) - crops(1, 1);
-    OP_REQUIRES(context, output_height > 0 && output_width > 0,
-                errors::InvalidArgument("Output dimensions must be positive"));
-
-    // Allocate output tensor.
-    Tensor* output = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(
-                                0, TensorShape({output_batch, output_height,
-                                                output_width, depth}),
-                                &output));
-
-    typename TTypes<T, 4>::ConstTensor Tinput = in0.tensor<T, 4>();
-    typename TTypes<T, 4>::Tensor Toutput = output->tensor<T, 4>();
-
-    functor::BatchToSpaceOpFunctor<Device, T> functor;
-    functor(context->eigen_device<Device>(),
-            Tinput, crops, block_size_, Toutput);
-  };
+    BatchToSpaceOpCompute<Device, T>(context, in0, block_shape_, in1);
+  }
 
  private:
   int block_size_;
+  Tensor block_shape_;
 };
 
-// Partial specialization of BatchToSpaceOpFunctor for a CPUDevice.
-namespace functor {
-template <typename T>
-struct BatchToSpaceOpFunctor<CPUDevice, T> {
-  void operator()(const CPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
-                  typename TTypes<int32>::ConstMatrix crops,
-                  int block_size, typename TTypes<T, 4>::Tensor output) {
-    const int input_batch = input.dimension(0);
-    const int input_height = input.dimension(1);
-    const int input_width = input.dimension(2);
-    const int depth = input.dimension(3);
-
-    const int output_batch = output.dimension(0);
-    const int output_height = output.dimension(1);
-    const int output_width = output.dimension(2);
-
-    const int crop_top = crops(0, 0);
-    const int crop_left = crops(1, 0);
-
-    for (int in_b = 0; in_b < input_batch; ++in_b) {
-      // in_b = (offset_h * block_size + offset_w) * output_batch + out_b
-      const int out_b = in_b % output_batch;
-      const int offset_w = (in_b / output_batch) % block_size;
-      const int offset_h = (in_b / output_batch) / block_size;
-      for (int in_h = 0; in_h < input_height; ++in_h) {
-        const int out_h = in_h * block_size + offset_h - crop_top;
-        for (int in_w = 0; in_w < input_width; ++in_w) {
-          const int out_w = in_w * block_size + offset_w - crop_left;
-          if (out_h >= 0 && out_w >= 0 &&
-              out_h < output_height && out_w < output_width) {
-            for (int d = 0; d < depth; ++d) {
-              output(out_b, out_h, out_w, d) = input(in_b, in_h, in_w, d);
-            }
-          }
-        }
-      }
-    }
-  }
-};
-}  // namespace functor
-
-#define REGISTER(T)                                                     \
-  REGISTER_KERNEL_BUILDER(Name("BatchToSpace")                          \
-                              .Device(DEVICE_CPU)                       \
-                              .TypeConstraint<T>("T")                   \
-                              .HostMemory("crops"),                     \
+#define REGISTER(T)                                        \
+  REGISTER_KERNEL_BUILDER(Name("BatchToSpaceND")           \
+                              .Device(DEVICE_CPU)          \
+                              .TypeConstraint<T>("T")      \
+                              .HostMemory("block_shape")   \
+                              .HostMemory("crops"),        \
+                          BatchToSpaceNDOp<CPUDevice, T>); \
+  REGISTER_KERNEL_BUILDER(Name("BatchToSpace")             \
+                              .Device(DEVICE_CPU)          \
+                              .TypeConstraint<T>("T")      \
+                              .HostMemory("crops"),        \
                           BatchToSpaceOp<CPUDevice, T>);
 
 TF_CALL_REAL_NUMBER_TYPES(REGISTER);
 #undef REGISTER
 
 #if GOOGLE_CUDA
-#define REGISTER(T)                                                     \
-  REGISTER_KERNEL_BUILDER(Name("BatchToSpace")                          \
-                              .Device(DEVICE_GPU)                       \
-                              .TypeConstraint<T>("T")                   \
-                              .HostMemory("crops"),                     \
+#define REGISTER(T)                                        \
+  REGISTER_KERNEL_BUILDER(Name("BatchToSpaceND")           \
+                              .Device(DEVICE_GPU)          \
+                              .TypeConstraint<T>("T")      \
+                              .HostMemory("block_shape")   \
+                              .HostMemory("crops"),        \
+                          BatchToSpaceNDOp<GPUDevice, T>); \
+  REGISTER_KERNEL_BUILDER(Name("BatchToSpace")             \
+                              .Device(DEVICE_GPU)          \
+                              .TypeConstraint<T>("T")      \
+                              .HostMemory("crops"),        \
                           BatchToSpaceOp<GPUDevice, T>);
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER);
diff --git a/tensorflow/core/kernels/batchtospace_op.h b/tensorflow/core/kernels/batchtospace_op.h
deleted file mode 100644
index d06af811b88..00000000000
--- a/tensorflow/core/kernels/batchtospace_op.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_KERNELS_BATCHTOSPACE_OP_H_
-#define TENSORFLOW_CORE_KERNELS_BATCHTOSPACE_OP_H_
-
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace tensorflow {
-namespace functor {
-
-// Functor used by BatchToSpaceOp to do the computations.
-template <typename Device, typename T>
-struct BatchToSpaceOpFunctor {
-  // Implements the batch to space conversion.
-  //
-  // input: 4-D input tensor.
-  // crops: [2, 2] matrix, [[crop_top, crop_bottom], [crop_left, crop_right]],
-  //   specifying how many elements to discard (un-pad) from the intermediate
-  //   result obtained after rearranging the batch data into spatial blocks.
-  // block_size: block size for the conversion.
-  // output: 4-D output tensor.
-  //
-  // The dimensions of the tensors are guaranteed to be correct when the
-  // functor is called.
-  void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor input,
-                  typename TTypes<int32>::ConstMatrix crops,
-                  int block_size, typename TTypes<T, 4>::Tensor output);
-};
-
-}  // namespace functor
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_KERNELS_BATCHTOSPACE_OP_H_
diff --git a/tensorflow/core/kernels/batchtospace_op_gpu.cu.cc b/tensorflow/core/kernels/batchtospace_op_gpu.cu.cc
deleted file mode 100644
index 283055aab4f..00000000000
--- a/tensorflow/core/kernels/batchtospace_op_gpu.cu.cc
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#if GOOGLE_CUDA
-
-#define EIGEN_USE_GPU
-
-#include "tensorflow/core/kernels/batchtospace_op.h"
-
-#include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/cuda_kernel_helper.h"
-
-namespace tensorflow {
-
-typedef Eigen::GpuDevice GPUDevice;
-
-template <typename T>
-__global__ void B2S(
-    const int32 nthreads, const T* input_ptr,
-    const int block_size, const int crop_top, const int crop_left,
-    const int input_batch, const int input_height, const int input_width,
-    const int depth, const int output_batch, const int output_height,
-    const int output_width, T* output_ptr) {
-  CUDA_1D_KERNEL_LOOP(inp_idx, nthreads) {
-    // inp_idx = d + input_depth * (w + input_width * (h + input_height * b))
-    const int d = inp_idx % depth;
-    const int inp_idx2 = inp_idx / depth;
-    const int w = inp_idx2 % input_width;
-    const int inp_idx3 = inp_idx2 / input_width;
-    const int h = inp_idx3 % input_height;
-    const int b = inp_idx3 / input_height;
-
-    const int out_b = b % output_batch;
-    const int offset_w = (b / output_batch) % block_size;
-    const int offset_h = (b / output_batch) / block_size;
-    const int out_h = h * block_size + offset_h - crop_top;
-    const int out_w = w * block_size + offset_w - crop_left;
-
-    if (out_h >= 0 && out_w >= 0 &&
-        out_h < output_height && out_w < output_width) {
-      const int out_idx =
-          d + depth * (out_w + output_width * (out_h + output_height * out_b));
-      output_ptr[out_idx] = ldg(input_ptr + inp_idx);
-    }
-  }
-}
-
-// Specialization of BatchToSpaceOpFunctor for a GPUDevice.
-namespace functor {
-template <typename T>
-struct BatchToSpaceOpFunctor<GPUDevice, T> {
-  void operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
-                  typename TTypes<int32>::ConstMatrix crops,
-                  int block_size, typename TTypes<T, 4>::Tensor output) {
-    const int input_batch = input.dimension(0);
-    const int input_height = input.dimension(1);
-    const int input_width = input.dimension(2);
-    const int depth = input.dimension(3);
-
-    const int output_batch = output.dimension(0);
-    const int output_height = output.dimension(1);
-    const int output_width = output.dimension(2);
-
-    const int crop_top = crops(0, 0);
-    const int crop_left = crops(1, 0);
-
-    const int total_count =
-        input_batch * input_height * input_width * depth;
-    CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
-    B2S<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, input.data(), block_size, crop_top,
-        crop_left, input_batch, input_height, input_width, depth,
-        output_batch, output_height, output_width, output.data());
-  }
-};
-}  // end namespace functor
-
-// Instantiate the GPU implementation.
-template struct functor::BatchToSpaceOpFunctor<GPUDevice, float>;
-template struct functor::BatchToSpaceOpFunctor<GPUDevice, double>;
-
-}  // end namespace tensorflow
-
-#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc
index 36831a42d0c..f12527364b2 100644
--- a/tensorflow/core/kernels/concat_op.cc
+++ b/tensorflow/core/kernels/concat_op.cc
@@ -32,7 +32,9 @@ limitations under the License.
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
+#if GOOGLE_CUDA
 typedef Eigen::GpuDevice GPUDevice;
+#endif  // GOOGLE_CUDA
 
 // --------------------------------------------------------------------------
 template <typename Device, typename T>
@@ -119,6 +121,7 @@ class ConcatOp : public OpKernel {
     if (output->NumElements() > 0) {
       int64 output_dim1 = output->NumElements() / inputs_flat_dim0;
       auto output_flat = output->shaped<T, 2>({inputs_flat_dim0, output_dim1});
+#if GOOGLE_CUDA
       if (std::is_same<Device, GPUDevice>::value) {
         // Switching indexing to int64 might cause performance issues.
         // Hence, we keep int32 indexing in the GPU kernel unless we need to
@@ -128,9 +131,10 @@ class ConcatOp : public OpKernel {
         } else {
           ConcatGPU64<T>(c->eigen_gpu_device(), inputs_flat, &output_flat);
         }
-      } else {
-        ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
+        return;
       }
+#endif  // GOOGLE_CUDA
+      ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
     }
   }
 };
diff --git a/tensorflow/core/kernels/cwise_op_digamma.cc b/tensorflow/core/kernels/cwise_op_digamma.cc
index 0f4bb9aedb1..8b7b5d4b6aa 100644
--- a/tensorflow/core/kernels/cwise_op_digamma.cc
+++ b/tensorflow/core/kernels/cwise_op_digamma.cc
@@ -16,10 +16,12 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
+#if EIGEN_HAS_C99_MATH
 REGISTER3(UnaryOp, CPU, "Digamma", functor::digamma, float, Eigen::half,
           double);
 #if GOOGLE_CUDA
 REGISTER3(UnaryOp, GPU, "Digamma", functor::digamma, float, Eigen::half,
           double);
-#endif
+#endif  // GOOGLE_CUDA
+#endif  // EIGEN_HAS_C99_MATH
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_erf.cc b/tensorflow/core/kernels/cwise_op_erf.cc
index 16a394f0524..5095285fbc4 100644
--- a/tensorflow/core/kernels/cwise_op_erf.cc
+++ b/tensorflow/core/kernels/cwise_op_erf.cc
@@ -16,8 +16,10 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
+#if EIGEN_HAS_C99_MATH
 REGISTER3(UnaryOp, CPU, "Erf", functor::erf, float, Eigen::half, double);
 #if GOOGLE_CUDA
 REGISTER3(UnaryOp, GPU, "Erf", functor::erf, float, Eigen::half, double);
-#endif
+#endif  // GOOGLE_CUDA
+#endif  // EIGEN_HAS_C99_MATH
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_erfc.cc b/tensorflow/core/kernels/cwise_op_erfc.cc
index e11cb95de43..ffc401352b9 100644
--- a/tensorflow/core/kernels/cwise_op_erfc.cc
+++ b/tensorflow/core/kernels/cwise_op_erfc.cc
@@ -16,8 +16,10 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
+#if EIGEN_HAS_C99_MATH
 REGISTER3(UnaryOp, CPU, "Erfc", functor::erfc, float, Eigen::half, double);
 #if GOOGLE_CUDA
 REGISTER3(UnaryOp, GPU, "Erfc", functor::erfc, float, Eigen::half, double);
-#endif
+#endif  // GOOGLE_CUDA
+#endif  // EIGEN_HAS_C99_MATH
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_igammas.cc b/tensorflow/core/kernels/cwise_op_igammas.cc
index a1d7f4dad43..b1ea921ffdf 100644
--- a/tensorflow/core/kernels/cwise_op_igammas.cc
+++ b/tensorflow/core/kernels/cwise_op_igammas.cc
@@ -16,6 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
+#if EIGEN_HAS_C99_MATH
 REGISTER2(BinaryOp, CPU, "Igamma", functor::igamma, float, double);
 REGISTER2(BinaryOp, CPU, "Igammac", functor::igammac, float, double);
+#endif  // EIGEN_HAS_C99_MATH
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_zeta.cc b/tensorflow/core/kernels/cwise_op_zeta.cc
index 2c5538534cc..6ccb54b6805 100644
--- a/tensorflow/core/kernels/cwise_op_zeta.cc
+++ b/tensorflow/core/kernels/cwise_op_zeta.cc
@@ -17,5 +17,7 @@ limitations under the License.
 
 namespace tensorflow {
 REGISTER2(BinaryOp, CPU, "Zeta", functor::zeta, float, double);
+#if EIGEN_HAS_C99_MATH
 REGISTER2(BinaryOp, CPU, "Polygamma", functor::polygamma, float, double);
+#endif  // EIGEN_HAS_C99_MATH
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_ops_common.cc b/tensorflow/core/kernels/cwise_ops_common.cc
index 4e37f585fb2..79bb223ccbe 100644
--- a/tensorflow/core/kernels/cwise_ops_common.cc
+++ b/tensorflow/core/kernels/cwise_ops_common.cc
@@ -45,20 +45,6 @@ void BinaryOpShared::SetComputeError(OpKernelContext* ctx) {
   }
 }
 
-static BCast::Vec FromShape(const TensorShape& shape) {
-  const int N = shape.dims();
-  BCast::Vec ret(N);
-  for (int i = 0; i < N; ++i) {
-    ret[i] = shape.dim_size(i);
-  }
-  return ret;
-}
-
-static TensorShape ToShape(const BCast::Vec& vec) {
-  TensorShape shape(vec);
-  return shape;
-}
-
 BinaryOpShared::BinaryOpState::BinaryOpState(OpKernelContext* ctx)
     : in0(ctx->input(0)),
       in1(ctx->input(1)),
diff --git a/tensorflow/core/kernels/debug_ops.h b/tensorflow/core/kernels/debug_ops.h
index 95c0e49b9c8..d916ca49960 100644
--- a/tensorflow/core/kernels/debug_ops.h
+++ b/tensorflow/core/kernels/debug_ops.h
@@ -38,13 +38,6 @@ class CopyOp : public OpKernel {
   void Compute(OpKernelContext* context) override {
     const Tensor& src_tensor = context->input(0);
 
-    DeviceContext* device_ctxt = context->op_device_context();
-    Device* device = static_cast<Device*>(context->device());
-
-    // Determine if the input tensor is not on CPU (e.g., on GPU).
-    bool off_host_input = device->device_type() == DEVICE_GPU &&
-                          !context->input_alloc_attr(0).on_host();
-
     if (src_tensor.IsInitialized()) {
       // Source tensor is initialized. Make a copy.
       Tensor* copied_tensor;
@@ -52,7 +45,13 @@ class CopyOp : public OpKernel {
                                                        &copied_tensor));
 
 #if GOOGLE_CUDA
+      Device* device = static_cast<Device*>(context->device());
+      // Determine if the input tensor is not on CPU (e.g., on GPU).
+      bool off_host_input = device->device_type() == DEVICE_GPU &&
+                            !context->input_alloc_attr(0).on_host();
+
       if (off_host_input) {
+        DeviceContext* device_ctxt = context->op_device_context();
         // Input is not on host: deep-copy it from GPU to the same GPU.
         Notification done_copy;
         GPUUtil::CopyGPUTensorToSameGPU(
diff --git a/tensorflow/core/kernels/example_parsing_ops.cc b/tensorflow/core/kernels/example_parsing_ops.cc
index 6338638a3b8..9a55ad8698f 100644
--- a/tensorflow/core/kernels/example_parsing_ops.cc
+++ b/tensorflow/core/kernels/example_parsing_ops.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 // See docs in ../ops/parsing_ops.cc.
 
+#include <numeric>
 #include <unordered_set>
-
 #include <vector>
 
 #include "tensorflow/core/example/example.pb.h"
diff --git a/tensorflow/core/kernels/fractional_avg_pool_op.cc b/tensorflow/core/kernels/fractional_avg_pool_op.cc
index a983d9362cc..9bba6712a22 100644
--- a/tensorflow/core/kernels/fractional_avg_pool_op.cc
+++ b/tensorflow/core/kernels/fractional_avg_pool_op.cc
@@ -235,8 +235,6 @@ class FractionalAvgPoolGradOp : public OpKernel {
     // tensor of double type. And cast it to the corresponding type.
     typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
         ConstEigenMatrixMap;
-    typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
-        EigenMatrixMap;
     typedef Eigen::Map<Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>>
         EigenDoubleMatrixMap;
 
diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc
index edce9a31978..179931fd4ec 100644
--- a/tensorflow/core/kernels/gather_op.cc
+++ b/tensorflow/core/kernels/gather_op.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/gather_op_cpu_impl.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/types.h"
@@ -92,81 +93,13 @@ class GatherOp : public OpKernel {
 
 namespace functor {
 
-// Helper method to copy using memcpy.
-template <typename T, typename Index, typename SliceIndex,
-          SliceIndex static_slice_elems>
-SliceIndex HandleCopies(typename TTypes<T>::ConstMatrix params,
-                        typename TTypes<Index>::ConstFlat indices,
-                        SliceIndex slice_elems,
-                        typename TTypes<T>::Matrix out) {
-  const SliceIndex first_dim_size =
-      static_cast<SliceIndex>(indices.dimension(0));
-  const Index limit = static_cast<Index>(params.dimension(0));
-  T* out_base = &out(0, 0);
-  const T* params_base = &params(0, 0);
-  if (static_slice_elems >= 0) {
-    // Give compiler static knowledge of the number of elements/bytes
-    CHECK_EQ(static_slice_elems, slice_elems);
-    slice_elems = static_slice_elems;
-  }
-  // Compute slice_bytes here so that static knowledge is available
-  const size_t slice_bytes = slice_elems * sizeof(T);
-  for (SliceIndex i = 0; i < first_dim_size; i++) {
-    const SliceIndex j = i + 1;
-    if (j < first_dim_size) {
-      port::prefetch<port::PREFETCH_HINT_T0>(&params(indices(j), 0));
-      port::prefetch<port::PREFETCH_HINT_T0>(&out(j, 0));
-    }
-    // Grab the index and check its validity.  An earlier version of the
-    // code checked it and then grabbed it from memory a second time, which
-    // was a security risk since it could have changed in between.
-    const Index index = internal::SubtleMustCopy(indices(i));
-    if (!FastBoundsCheck(index, limit)) return i;
-    // Copy using memcpy if possible, otherwise an Eigen loop
-    if (Allocator::is_simple<T>::value) {
-      memcpy(out_base + i * slice_elems, params_base + index * slice_elems,
-             slice_bytes);
-    } else {
-      out.template chip<0>(i) = params.template chip<0>(index);
-    }
-  }
-  return -1;
-}
-
 // Specialization gather functor for CPU.
 template <typename T, typename Index>
 struct Gather<CPUDevice, T, Index> {
   int64 operator()(const CPUDevice& d, typename TTypes<T>::ConstMatrix params,
                    typename TTypes<Index>::ConstFlat indices,
                    typename TTypes<T>::Matrix out) {
-    const int64 N = indices.size();
-    const int64 slice_size = out.size() / N;
-    int64 bad_i;
-
-    bool use_large = (slice_size > std::numeric_limits<int32>::max() ||
-                      params.size() > std::numeric_limits<int32>::max() ||
-                      N > std::numeric_limits<int32>::max());
-#define CALL(elems)                                                   \
-  do {                                                                \
-    if (use_large) {                                                  \
-      bad_i = HandleCopies<T, Index, int64, elems>(params, indices,   \
-                                                   slice_size, out);  \
-    } else {                                                          \
-      const int32 small_slice = static_cast<int32>(slice_size);       \
-      bad_i = HandleCopies<T, Index, int32, elems>(params, indices,   \
-                                                   small_slice, out); \
-    }                                                                 \
-  } while (0)
-
-    if (slice_size == 10)
-      CALL(10);
-    else if (slice_size == 20)
-      CALL(20);
-    else
-      CALL(-1);
-#undef CALL
-
-    return bad_i;
+    return GatherCpu<T, Index>()(params, indices, out);
   }
 };
 }  // namespace functor
diff --git a/tensorflow/core/kernels/gather_op_cpu_impl.h b/tensorflow/core/kernels/gather_op_cpu_impl.h
new file mode 100644
index 00000000000..49fff679c41
--- /dev/null
+++ b/tensorflow/core/kernels/gather_op_cpu_impl.h
@@ -0,0 +1,114 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_KERNELS_GATHER_OP_CPU_IMPL_H_
+#define TENSORFLOW_KERNELS_GATHER_OP_CPU_IMPL_H_
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mem.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+namespace functor {
+
+// Helper method to copy using memcpy.
+template <typename T, typename Index, typename SliceIndex,
+          SliceIndex static_slice_elems>
+SliceIndex HandleCopies(typename TTypes<T>::ConstMatrix params,
+                        typename TTypes<Index>::ConstFlat indices,
+                        SliceIndex slice_elems,
+                        typename TTypes<T>::Matrix out) {
+  const SliceIndex first_dim_size =
+      static_cast<SliceIndex>(indices.dimension(0));
+  const Index limit = static_cast<Index>(params.dimension(0));
+  T* out_base = &out(0, 0);
+  const T* params_base = &params(0, 0);
+  if (static_slice_elems >= 0) {
+    // Give compiler static knowledge of the number of elements/bytes
+    CHECK_EQ(static_slice_elems, slice_elems);
+    slice_elems = static_slice_elems;
+  }
+  // Compute slice_bytes here so that static knowledge is available
+  const size_t slice_bytes = slice_elems * sizeof(T);
+  for (SliceIndex i = 0; i < first_dim_size; i++) {
+    const SliceIndex j = i + 1;
+    if (j < first_dim_size) {
+      port::prefetch<port::PREFETCH_HINT_T0>(&params(indices(j), 0));
+      port::prefetch<port::PREFETCH_HINT_T0>(&out(j, 0));
+    }
+    // Grab the index and check its validity.  An earlier version of the
+    // code checked it and then grabbed it from memory a second time, which
+    // was a security risk since it could have changed in between.
+    const Index index = internal::SubtleMustCopy(indices(i));
+    if (!FastBoundsCheck(index, limit)) return i;
+    // Copy using memcpy if possible, otherwise an Eigen loop
+    // TODO(cwhipkey): avoid linking to framework to get Allocator (to improve
+    // ahead-of-time compilation binary size).
+    if (Allocator::is_simple<T>::value) {
+      memcpy(out_base + i * slice_elems, params_base + index * slice_elems,
+             slice_bytes);
+    } else {
+      out.template chip<0>(i) = params.template chip<0>(index);
+    }
+  }
+  return -1;
+}
+
+template <typename T, typename Index>
+struct GatherCpu {
+  int64 operator()(typename TTypes<T>::ConstMatrix params,
+                   typename TTypes<Index>::ConstFlat indices,
+                   typename TTypes<T>::Matrix out) {
+    const int64 N = indices.size();
+    const int64 slice_size = out.size() / N;
+    int64 bad_i;
+
+    bool use_large = (slice_size > std::numeric_limits<int32>::max() ||
+                      params.size() > std::numeric_limits<int32>::max() ||
+                      N > std::numeric_limits<int32>::max());
+#define CALL(elems)                                                   \
+  do {                                                                \
+    if (use_large) {                                                  \
+      bad_i = HandleCopies<T, Index, int64, elems>(params, indices,   \
+                                                   slice_size, out);  \
+    } else {                                                          \
+      const int32 small_slice = static_cast<int32>(slice_size);       \
+      bad_i = HandleCopies<T, Index, int32, elems>(params, indices,   \
+                                                   small_slice, out); \
+    }                                                                 \
+  } while (0)
+
+    if (slice_size == 10)
+      CALL(10);
+    else if (slice_size == 20)
+      CALL(20);
+    else
+      CALL(-1);
+#undef CALL
+
+    return bad_i;
+  }
+};
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_KERNELS_GATHER_OP_CPU_IMPL_H_
diff --git a/tensorflow/core/kernels/immutable_constant_op_test.cc b/tensorflow/core/kernels/immutable_constant_op_test.cc
index 033337ea817..93d726a64d4 100644
--- a/tensorflow/core/kernels/immutable_constant_op_test.cc
+++ b/tensorflow/core/kernels/immutable_constant_op_test.cc
@@ -63,18 +63,16 @@ class TestFileSystem : public NullFileSystem {
       const string& fname,
       std::unique_ptr<ReadOnlyMemoryRegion>* result) override {
     float val = 0;
+    StringPiece scheme, host, path;
+    ParseURI(fname, &scheme, &host, &path);
     // For the tests create in-memory regions with float values equal to the
-    // first letter of the region name.
-    switch (GetNameFromURI(fname).front()) {
-      case '2':
-        val = 2.0f;
-        break;
-      case '3':
-        val = 3.0f;
-        break;
-      default:
-        val = 0.0f;
-        break;
+    // region name.
+    if (path == "/2") {
+      val = 2.0f;
+    } else if (path == "/3") {
+      val = 3.0f;
+    } else {
+      val = 0.0f;
     }
 
     auto region = new TestReadOnlyMemoryRegion(kTestTensorSizeBytes);
@@ -93,9 +91,9 @@ TEST(ImmutableConstantOpTest, Simple) {
   const TensorShape kTestTensorShapeT({1, 4});
   auto root = Scope::NewRootScope().ExitOnError();
   auto node1 =
-      ops::ImmutableConst(root, DT_FLOAT, kTestTensorShape, "test://2");
+      ops::ImmutableConst(root, DT_FLOAT, kTestTensorShape, "test:///2");
   auto node2 =
-      ops::ImmutableConst(root, DT_FLOAT, kTestTensorShapeT, "test://3");
+      ops::ImmutableConst(root, DT_FLOAT, kTestTensorShapeT, "test:///3");
   auto result = ops::MatMul(root, node1, node2);
   GraphDef graph_def;
   TF_ASSERT_OK(root.ToGraphDef(&graph_def));
@@ -124,9 +122,10 @@ TEST(ImmutableConstantOpTest, ExecutionError) {
   const TensorShape kTestTensorShapeT({1, 4});
 
   auto root = Scope::NewRootScope().ExitOnError();
-  auto node1 = ops::ImmutableConst(root, DT_FLOAT, kBadTensorShape, "test://2");
+  auto node1 =
+      ops::ImmutableConst(root, DT_FLOAT, kBadTensorShape, "test:///2");
   auto node2 =
-      ops::ImmutableConst(root, DT_FLOAT, kTestTensorShapeT, "test://3");
+      ops::ImmutableConst(root, DT_FLOAT, kTestTensorShapeT, "test:///3");
   auto result = ops::MatMul(root, node1, node2);
   GraphDef graph_def;
   TF_ASSERT_OK(root.ToGraphDef(&graph_def));
diff --git a/tensorflow/core/kernels/matching_files_op.cc b/tensorflow/core/kernels/matching_files_op.cc
index b81b6eb9a12..a35b5889d33 100644
--- a/tensorflow/core/kernels/matching_files_op.cc
+++ b/tensorflow/core/kernels/matching_files_op.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/io/match.h"
 #include "tensorflow/core/platform/env.h"
 
 namespace tensorflow {
@@ -37,9 +36,8 @@ class MatchingFilesOp : public OpKernel {
                     "Input pattern tensor must be scalar, but had shape: ",
                     pattern->shape().DebugString()));
     std::vector<string> fnames;
-    OP_REQUIRES_OK(context,
-                   io::GetMatchingFiles(context->env(),
-                                        pattern->scalar<string>()(), &fnames));
+    OP_REQUIRES_OK(context, context->env()->GetMatchingPaths(
+                                pattern->scalar<string>()(), &fnames));
     const int num_out = fnames.size();
     Tensor* output = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(
diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc
index 40d1b86db32..fc255557886 100644
--- a/tensorflow/core/kernels/pack_op.cc
+++ b/tensorflow/core/kernels/pack_op.cc
@@ -31,7 +31,9 @@ limitations under the License.
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
+#if GOOGLE_CUDA
 typedef Eigen::GpuDevice GPUDevice;
+#endif  // GOOGLE_CUDA
 
 // --------------------------------------------------------------------------
 template <typename Device, typename T>
@@ -107,6 +109,7 @@ class PackOp : public OpKernel {
         inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
             values[i].shaped<T, 2>({before_dim, after_dim})));
       }
+#if GOOGLE_CUDA
       if (std::is_same<Device, GPUDevice>::value) {
         // Switching indexing to int64 might cause performance issues.
         // Hence, we keep int32 indexing in the GPU kernel unless we need to
@@ -116,9 +119,10 @@ class PackOp : public OpKernel {
         } else {
           ConcatGPU64<T>(c->eigen_gpu_device(), inputs_flat, &output_flat);
         }
-      } else {
-        ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
+        return;
       }
+#endif  // GOOGLE_CUDA
+      ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
     }
   }
 
diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
index 5c4155e777e..4d31edbb1a9 100644
--- a/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
+++ b/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
@@ -162,7 +162,7 @@ struct TruncatedNormalFunctor<CPUDevice, T> {
               z[i] = rand[i] * diff + normMin;
             }
             for (int i = 0; i < size; i++) {
-              g[i] = (plusFactor - z[i] * z[i]) / 2.0;
+              g[i] = (plusFactor - z[i] * z[i]) / T(2.0);
             }
 
             const auto u = dist(&gen_copy);
@@ -202,7 +202,7 @@ struct TruncatedNormalFunctor<CPUDevice, T> {
               const T z = -Eigen::numext::log(rand[i]) / alpha + normMin;
               i++;
               const T x = normMin < alpha ? alpha - z : normMin - alpha;
-              const T g = Eigen::numext::exp(-x * x / 2.0);
+              const T g = Eigen::numext::exp(-x * x / T(2.0));
               const T u = rand[i];
               i++;
               if ((u <= g && z < normMax) ||
diff --git a/tensorflow/core/kernels/pooling_ops_3d.cc b/tensorflow/core/kernels/pooling_ops_3d.cc
index 87b84c79a5e..e00764ff803 100644
--- a/tensorflow/core/kernels/pooling_ops_3d.cc
+++ b/tensorflow/core/kernels/pooling_ops_3d.cc
@@ -102,8 +102,8 @@ class Pooling3DOp : public UnaryOp<T> {
     // Dimension order for these arrays is: x, y, z.
     std::array<int64, 3> input_size{
         {tensor_in.dim_size(3), tensor_in.dim_size(2), tensor_in.dim_size(1)}};
-    std::array<int64, 3> window({{ksize_[3], ksize_[2], ksize_[1]}});
-    std::array<int64, 3> stride({{stride_[3], stride_[2], stride_[1]}});
+    std::array<int64, 3> window{{ksize_[3], ksize_[2], ksize_[1]}};
+    std::array<int64, 3> stride{{stride_[3], stride_[2], stride_[1]}};
     std::array<int64, 3> padding, out;
 
     OP_REQUIRES_OK(context, Get3dOutputSize(input_size, window, stride,
diff --git a/tensorflow/core/kernels/sample_distorted_bounding_box_op.cc b/tensorflow/core/kernels/sample_distorted_bounding_box_op.cc
index a6f1d7edbaa..3cb0444160b 100644
--- a/tensorflow/core/kernels/sample_distorted_bounding_box_op.cc
+++ b/tensorflow/core/kernels/sample_distorted_bounding_box_op.cc
@@ -24,6 +24,7 @@ limitations under the License.
 
 using tensorflow::random::SimplePhilox;
 
+namespace tensorflow {
 namespace {
 
 // A simple Rectangle class that supplies intersection.
@@ -190,7 +191,6 @@ bool GenerateRandomCrop(int original_width, int original_height,
 }
 }  // namespace
 
-namespace tensorflow {
 template <typename T>
 class SampleDistortedBoundingBoxOp : public OpKernel {
  public:
diff --git a/tensorflow/core/kernels/spacetobatch_benchmark_test.cc b/tensorflow/core/kernels/spacetobatch_benchmark_test.cc
new file mode 100644
index 00000000000..1730d85facb
--- /dev/null
+++ b/tensorflow/core/kernels/spacetobatch_benchmark_test.cc
@@ -0,0 +1,72 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+
+static Graph* ConstructSpaceToBatchGraph(
+    const char* op_name, const TensorShape& input_shape, const int block_size,
+    const std::vector<std::pair<int, int>>& paddings) {
+  const int num_block_dims = 2;
+  CHECK_EQ(num_block_dims, paddings.size());
+  Graph* g = new Graph(OpRegistry::Global());
+  Tensor input(DT_FLOAT, input_shape);
+  input.flat<float>().setRandom();
+  Tensor paddings_tensor(DT_INT32, TensorShape({num_block_dims, 2}));
+  auto paddings_eigen_tensor = paddings_tensor.matrix<int32>();
+  for (int block_dim = 0; block_dim < num_block_dims; ++block_dim) {
+    paddings_eigen_tensor(block_dim, 0) = paddings[block_dim].first;
+    paddings_eigen_tensor(block_dim, 1) = paddings[block_dim].second;
+  }
+  Node* ret;
+  NodeBuilder(g->NewName("n"), op_name)
+      .Input(test::graph::Constant(g, input))
+      .Input(test::graph::Constant(g, paddings_tensor))
+      .Attr("block_size", block_size)
+      .Finalize(g, &ret);
+  return g;
+}
+
+#define BM_SpaceToBatchDev(OP, DEVICE, B, H, W, D, BS, P00, P01, P10, P11)                    \
+  static void                                                                                 \
+      BM_##OP##_##DEVICE##_##B##_##H##_##W##_##D##_bs##BS##_pad##P00##_##P01##_##P10##_##P11( \
+          int iters) {                                                                        \
+    testing::ItemsProcessed(static_cast<int64>(iters) * B * (H + P00 + P01) *                 \
+                            (W + P10 + P11) * D);                                             \
+    test::Benchmark(#DEVICE,                                                                  \
+                    ConstructSpaceToBatchGraph(#OP, TensorShape({B, H, W, D}),                \
+                                               BS, {{P00, P01}, {P10, P11}}))                 \
+        .Run(iters);                                                                          \
+  }                                                                                           \
+  BENCHMARK(                                                                                  \
+      BM_##OP##_##DEVICE##_##B##_##H##_##W##_##D##_bs##BS##_pad##P00##_##P01##_##P10##_##P11);
+#define BM_SpaceToBatch(OP, ...)            \
+  BM_SpaceToBatchDev(OP, cpu, __VA_ARGS__); \
+  BM_SpaceToBatchDev(OP, gpu, __VA_ARGS__);
+
+BM_SpaceToBatch(SpaceToBatch, 64, 100, 100, 64, 2, 0, 0, 0, 0);
+BM_SpaceToBatch(SpaceToBatch, 64, 100, 100, 1, 2, 0, 0, 0, 0);
+BM_SpaceToBatch(SpaceToBatch, 64, 100, 100, 64, 2, 3, 3, 3, 3);
+
+BM_SpaceToBatch(BatchToSpace, 256, 50, 50, 64, 2, 0, 0, 0, 0);
+BM_SpaceToBatch(BatchToSpace, 256, 50, 50, 1, 2, 0, 0, 0, 0);
+BM_SpaceToBatch(BatchToSpace, 256, 50, 50, 64, 2, 3, 3, 3, 3);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/spacetobatch_functor.cc b/tensorflow/core/kernels/spacetobatch_functor.cc
new file mode 100644
index 00000000000..23d8a5f9ed4
--- /dev/null
+++ b/tensorflow/core/kernels/spacetobatch_functor.cc
@@ -0,0 +1,168 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Specialization of SpaceToBatchFunctor for a CPUDevice.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/spacetobatch_functor.h"
+
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+namespace functor {
+
+namespace {
+
+// Implementation of nested loops for SpaceToBatchOpFunctor.
+//
+// To simplify template implementation given lack of constexpr if, both the
+// input and output pointers are non-const.
+template <int N, bool B2S>
+struct SpaceToBatchHelper {
+  template <typename T>
+  static void run(T* space_tensor_ptr, const int64* space_tensor_shape,
+                  const int64* space_tensor_strides, const int64* block_shape,
+                  const int64* pad_start, const int64* block_offsets,
+                  const int64* batch_tensor_shape,
+                  const int64* batch_tensor_strides, T* batch_tensor_ptr) {
+    for (int64 batch_tensor_pos = 0; batch_tensor_pos < batch_tensor_shape[0];
+         ++batch_tensor_pos) {
+      const int64 space_tensor_pos =
+          batch_tensor_pos * block_shape[0] + block_offsets[0] - pad_start[0];
+      if (space_tensor_pos >= 0 && space_tensor_pos < space_tensor_shape[0]) {
+        SpaceToBatchHelper<N - 1, B2S>::run(
+            space_tensor_ptr + space_tensor_pos * space_tensor_strides[0],
+            space_tensor_shape + 1, space_tensor_strides + 1, block_shape + 1,
+            pad_start + 1, block_offsets + 1, batch_tensor_shape + 1,
+            batch_tensor_strides + 1, batch_tensor_ptr);
+      } else {
+        if (B2S == false) {
+          // Copy in padding.
+          for (int64 i = 0; i < batch_tensor_strides[0]; ++i) {
+            batch_tensor_ptr[i] = static_cast<T>(0);
+          }
+        }
+      }
+      batch_tensor_ptr += batch_tensor_strides[0];
+    }
+  }
+};
+
+template <bool B2S>
+struct SpaceToBatchHelper<0, B2S> {
+  template <typename T>
+  static void run(T* space_tensor_ptr, const int64* space_tensor_shape,
+                  const int64* space_tensor_strides, const int64* block_shape,
+                  const int64* pad_start, const int64* block_offsets,
+                  const int64* batch_tensor_shape,
+                  const int64* batch_tensor_strides, T* batch_tensor_ptr) {
+    for (int64 i = 0; i < batch_tensor_strides[-1]; ++i) {
+      if (B2S == false) {
+        batch_tensor_ptr[i] = space_tensor_ptr[i];
+      } else {
+        space_tensor_ptr[i] = batch_tensor_ptr[i];
+      }
+    }
+  }
+};
+
+}  // namespace
+
+template <typename T, int NUM_BLOCK_DIMS, bool B2S>
+struct SpaceToBatchFunctor<CPUDevice, T, NUM_BLOCK_DIMS, B2S> {
+  using SpaceT = typename std::conditional<B2S, T, const T>::type;
+  using BatchT = typename std::conditional<B2S, const T, T>::type;
+  Status operator()(
+      const CPUDevice& d,
+      typename TTypes<SpaceT, NUM_BLOCK_DIMS + 2>::Tensor space_tensor,
+      const int64 block_shape_tensor[NUM_BLOCK_DIMS],
+      const int64 paddings_tensor[NUM_BLOCK_DIMS * 2],
+      typename TTypes<BatchT, NUM_BLOCK_DIMS + 2>::Tensor batch_tensor) {
+    const int64 batch_tensor_batch = batch_tensor.dimension(0);
+
+    const int64 space_tensor_batch = space_tensor.dimension(0);
+
+    // Copy into local array so that the compiler is free to place in a
+    // register.
+    int64 pad_start[NUM_BLOCK_DIMS];
+    int64 block_shape[NUM_BLOCK_DIMS];
+    int64 space_tensor_shape[NUM_BLOCK_DIMS],
+        batch_tensor_shape[NUM_BLOCK_DIMS];
+    for (int block_dim = 0; block_dim < NUM_BLOCK_DIMS; ++block_dim) {
+      pad_start[block_dim] = paddings_tensor[block_dim * 2];
+      block_shape[block_dim] = block_shape_tensor[block_dim];
+      space_tensor_shape[block_dim] = space_tensor.dimension(block_dim + 1);
+      batch_tensor_shape[block_dim] = batch_tensor.dimension(block_dim + 1);
+    }
+
+    int64 space_tensor_strides[NUM_BLOCK_DIMS + 2],
+        batch_tensor_strides[NUM_BLOCK_DIMS + 2];
+    space_tensor_strides[NUM_BLOCK_DIMS + 1] =
+        batch_tensor_strides[NUM_BLOCK_DIMS + 1] = 1;
+    for (int dim = NUM_BLOCK_DIMS; dim >= 0; --dim) {
+      space_tensor_strides[dim] =
+          space_tensor_strides[dim + 1] * space_tensor.dimension(dim + 1);
+      batch_tensor_strides[dim] =
+          batch_tensor_strides[dim + 1] * batch_tensor.dimension(dim + 1);
+    }
+
+    // Use non-const pointers for both input and output to simplify template
+    // implementation given lack of constexpr if.
+    T* space_tensor_ptr = const_cast<T*>(space_tensor.data());
+    T* batch_tensor_ptr = const_cast<T*>(batch_tensor.data());
+
+    for (int64 batch_tensor_b = 0; batch_tensor_b < batch_tensor_batch;
+         ++batch_tensor_b) {
+      const int64 space_tensor_b = batch_tensor_b % space_tensor_batch;
+      int64 block_index = batch_tensor_b / space_tensor_batch;
+      int64 block_offsets[NUM_BLOCK_DIMS];
+      for (int block_dim = NUM_BLOCK_DIMS - 1; block_dim >= 0; --block_dim) {
+        // Skip unnecessary remainder operation for block_dim == 0.
+        block_offsets[block_dim] =
+            block_dim > 0 ? block_index % block_shape[block_dim] : block_index;
+        block_index /= block_shape[block_dim];
+      }
+
+      // The compiler should inline the nested loops generated by this template.
+      SpaceToBatchHelper<NUM_BLOCK_DIMS, B2S>::run(
+          space_tensor_ptr + space_tensor_b * space_tensor_strides[0],
+          space_tensor_shape, &space_tensor_strides[1], block_shape, pad_start,
+          block_offsets, batch_tensor_shape, &batch_tensor_strides[1],
+          batch_tensor_ptr + batch_tensor_b * batch_tensor_strides[0]);
+    }
+    return Status::OK();
+  }
+};
+
+// Instantiate.
+#define INSTANTIATE(NUM_BLOCK_DIMS, T)                                      \
+  template struct SpaceToBatchFunctor<CPUDevice, T, NUM_BLOCK_DIMS, false>; \
+  template struct SpaceToBatchFunctor<CPUDevice, T, NUM_BLOCK_DIMS, true>;  \
+/**/
+
+#define INSTANTIATE_FOR_T(T) \
+  TF_SPACETOBATCH_FOR_EACH_NUM_BLOCK_DIMS(INSTANTIATE, T)
+
+TF_CALL_REAL_NUMBER_TYPES(INSTANTIATE_FOR_T)
+
+#undef INSTANTIATE_FOR_T
+#undef INSTANTIATE
+
+}  // namespace functor
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/spacetobatch_functor.h b/tensorflow/core/kernels/spacetobatch_functor.h
new file mode 100644
index 00000000000..06813650c08
--- /dev/null
+++ b/tensorflow/core/kernels/spacetobatch_functor.h
@@ -0,0 +1,114 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_SPACETOBATCH_FUNCTOR_H_
+#define TENSORFLOW_CORE_KERNELS_SPACETOBATCH_FUNCTOR_H_
+
+#include <type_traits>
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+// Maximum number of non-collapsible blocked dimensions supported by the
+// {SpaceToBatch,BatchToSpace}ND operation.  To change the limit, modify this
+// constant and the TF_SPACETOBATCH_FOR_EACH_NUM_BLOCK_DIMS macro definition
+// below.
+constexpr int kMaxSpaceToBatchBlockDims = 4;
+
+// Expands to:
+//   MACRO(1, ## __VA_ARGS__)
+//   ...
+//   MACRO(kMaxSpaceToBatchBlockDims, ## __VA_ARGS__)
+//
+// Note: The space between the number and the comma is necessary for proper GCC
+// comma handling: https://gcc.gnu.org/onlinedocs/cpp/Variadic-Macros.html
+#define TF_SPACETOBATCH_FOR_EACH_NUM_BLOCK_DIMS(MACRO, ...) \
+  MACRO(1 /**/, ##__VA_ARGS__)                              \
+  MACRO(2 /**/, ##__VA_ARGS__)                              \
+  MACRO(3 /**/, ##__VA_ARGS__)                              \
+  MACRO(4 /**/, ##__VA_ARGS__)                              \
+/**/
+
+namespace internal {
+namespace spacetobatch {
+
+template <typename InputType, typename OutputType>
+void SubtleMustCopyFlatHelper(const Tensor& t, OutputType* output) {
+  const int64 num_elements = t.shape().num_elements();
+  output->resize(num_elements);
+  auto eigen_vec = t.flat<InputType>();
+  for (int64 i = 0; i < num_elements; ++i) {
+    (*output)[i] = SubtleMustCopy(eigen_vec(i));
+  }
+}
+
+// Copies flat contents of `t` to std::vector-like `*output`, which is resized
+// as needed.  `OutputType` may be either `std::vector<int64>` or
+// `gtl::InlinedVector<int64>`.
+//
+// Precondition: t.dtype() must be either DT_INT32 or DT_INT64.
+template <typename OutputType>
+void SubtleMustCopyFlat(const Tensor& t, OutputType* output) {
+  if (t.dtype() == DT_INT32) {
+    SubtleMustCopyFlatHelper<int32, OutputType>(t, output);
+  } else {
+    SubtleMustCopyFlatHelper<int64, OutputType>(t, output);
+  }
+}
+
+}  // namespace spacetobatch
+}  // namespace internal
+
+namespace functor {
+
+// Functor used by {SpaceToBatch,BatchToSpace}{ND,}Op to do the conversion.
+//
+// If B2S is false, then this performs the space-to-batch conversion.  If B2S
+// true, then this performs the inverse batch-to-space conversion.
+template <typename Device, typename T, int NUM_BLOCK_DIMS, bool B2S = false>
+struct SpaceToBatchFunctor {
+  using InputT = typename std::conditional<B2S, T, const T>::type;
+  using OutputT = typename std::conditional<B2S, const T, T>::type;
+  // Implements the space to batch conversion.
+  //
+  // space_tensor: input tensor of space-to-batch operation.  If B2S = false,
+  //     then this is the input to the conversion.  If B2S = true, then this
+  //     is the output of the conversion.
+  // block_size: array of shape [NUM_BLOCK_DIMS] specifying the block sizes for
+  //     dimensions 1 through NUM_BLOCK_DIMS.
+  // paddings: row-major array of shape [NUM_BLOCK_DIMS, 2] specifying the
+  //     start and end padding for dimensions 1 through NUM_BLOCK_DIMS.
+  // batch_tensor: output tensor of the space-to-batch operation.  If
+  //     B2S = false, then this is the output of the conversion.  If B2S = true,
+  //     then this is the input to the conversion.
+  //
+  // The caller must ensure that the dimensions of the tensors are correct.
+  Status operator()(
+      const Device& d,
+      typename TTypes<InputT, NUM_BLOCK_DIMS + 2>::Tensor space_tensor,
+      const int64 block_shape[NUM_BLOCK_DIMS],
+      const int64 paddings[NUM_BLOCK_DIMS * 2],
+      typename TTypes<OutputT, NUM_BLOCK_DIMS + 2>::Tensor batch_tensor);
+};
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_SPACETOBATCH_FUNCTOR_H_
diff --git a/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc b/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc
new file mode 100644
index 00000000000..e7848be2e39
--- /dev/null
+++ b/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc
@@ -0,0 +1,169 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Specialization of SpaceToBatchFunctor for a GPUDevice.
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/spacetobatch_functor.h"
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// Shape and padding parameters for space-to-batch and batch-to-space conversion
+// GPU kernel.
+template <int NUM_BLOCK_DIMS>
+struct S2BParameters {
+  int32 space_tensor_batch;
+  int32 batch_tensor_shape[NUM_BLOCK_DIMS + 2];
+  int32 space_tensor_spatial_shape[NUM_BLOCK_DIMS];
+  int32 pad_start[NUM_BLOCK_DIMS];
+  int32 block_shape[NUM_BLOCK_DIMS];
+};
+
+// GPU kernel for space-to-batch (if B2S = false) and batch-to-space conversion
+// (if B2S = true).
+//
+// To simplify template implementation given lack of constexpr if, both the
+// input and output pointers are non-const.
+template <typename T, int NUM_BLOCK_DIMS, bool B2S>
+__global__ void S2B(const int32 nthreads, T* space_tensor_ptr,
+                    S2BParameters<NUM_BLOCK_DIMS> args, T* batch_tensor_ptr) {
+  CUDA_1D_KERNEL_LOOP(batch_tensor_idx, nthreads) {
+    int32 remaining_batch_tensor_idx = batch_tensor_idx;
+
+    int32 batch_tensor_pos[NUM_BLOCK_DIMS + 2];
+
+    for (int dim = NUM_BLOCK_DIMS + 1; dim >= 1; --dim) {
+      batch_tensor_pos[dim] =
+          remaining_batch_tensor_idx % args.batch_tensor_shape[dim];
+      remaining_batch_tensor_idx /= args.batch_tensor_shape[dim];
+    }
+    batch_tensor_pos[0] = remaining_batch_tensor_idx;
+
+    int32 remaining_block_idx = batch_tensor_pos[0] / args.space_tensor_batch;
+    int32 space_tensor_idx = batch_tensor_pos[NUM_BLOCK_DIMS + 1];
+    int32 space_tensor_stride = args.batch_tensor_shape[NUM_BLOCK_DIMS + 1];
+    const int32 space_tensor_batch_pos =
+        batch_tensor_pos[0] % args.space_tensor_batch;
+    for (int block_dim = NUM_BLOCK_DIMS - 1; block_dim >= 0; --block_dim) {
+      int32 offset = remaining_block_idx;
+      if (block_dim > 0) {
+        offset %= args.block_shape[block_dim];
+      }
+      int32 space_tensor_pos =
+          batch_tensor_pos[block_dim + 1] * args.block_shape[block_dim] +
+          offset - args.pad_start[block_dim];
+      if (space_tensor_pos < 0 ||
+          space_tensor_pos >= args.space_tensor_spatial_shape[block_dim]) {
+        if (B2S == false) {
+          // In the space-to-batch case, write zero padding.
+          batch_tensor_ptr[batch_tensor_idx] = static_cast<T>(0);
+        }
+        break;
+      }
+      space_tensor_idx += space_tensor_stride * space_tensor_pos;
+      space_tensor_stride *= args.space_tensor_spatial_shape[block_dim];
+      if (block_dim == 0) {
+        space_tensor_idx += space_tensor_stride * space_tensor_batch_pos;
+        if (B2S == false) {
+          batch_tensor_ptr[batch_tensor_idx] =
+              ldg(space_tensor_ptr + space_tensor_idx);
+        } else {
+          space_tensor_ptr[space_tensor_idx] =
+              ldg(batch_tensor_ptr + batch_tensor_idx);
+        }
+      }
+      remaining_block_idx /= args.block_shape[block_dim];
+    }
+  }
+}
+
+namespace functor {
+template <typename T, int NUM_BLOCK_DIMS, bool B2S>
+struct SpaceToBatchFunctor<GPUDevice, T, NUM_BLOCK_DIMS, B2S> {
+  using SpaceT = typename std::conditional<B2S, T, const T>::type;
+  using BatchT = typename std::conditional<B2S, const T, T>::type;
+  Status operator()(
+      const GPUDevice& d,
+      typename TTypes<SpaceT, NUM_BLOCK_DIMS + 2>::Tensor space_tensor,
+      const int64 block_shape[NUM_BLOCK_DIMS],
+      const int64 paddings[NUM_BLOCK_DIMS * 2],
+      typename TTypes<BatchT, NUM_BLOCK_DIMS + 2>::Tensor batch_tensor) {
+    // Kernel execution fails if number of elements is zero.
+    if (batch_tensor.size() == 0) {
+      return Status::OK();
+    }
+    S2BParameters<NUM_BLOCK_DIMS> args;
+    args.space_tensor_batch = space_tensor.dimension(0);
+    for (int block_dim = 0; block_dim < NUM_BLOCK_DIMS; ++block_dim) {
+      if (block_shape[block_dim] > std::numeric_limits<int32>::max()) {
+        return errors::InvalidArgument("block_shape value exceeds 2^32-1");
+      }
+      args.block_shape[block_dim] = block_shape[block_dim];
+      if (space_tensor.dimension(block_dim + 1) >
+          std::numeric_limits<int32>::max()) {
+        return errors::InvalidArgument("space_tensor dimension exceeds 2^32-1");
+      }
+      args.space_tensor_spatial_shape[block_dim] =
+          space_tensor.dimension(block_dim + 1);
+      if (paddings[block_dim * 2] > std::numeric_limits<int32>::max()) {
+        return errors::InvalidArgument("paddings/crops value exceeds 2^32-1");
+      }
+      args.pad_start[block_dim] = paddings[block_dim * 2];
+    }
+    int64 total_count = 1;
+    for (int dim = 0; dim < NUM_BLOCK_DIMS + 2; ++dim) {
+      args.batch_tensor_shape[dim] = batch_tensor.dimension(dim);
+      total_count *= args.batch_tensor_shape[dim];
+    }
+    if (total_count > std::numeric_limits<int32>::max()) {
+      return errors::InvalidArgument(
+          "number of batch_tensor elements exceeds 2^32-1");
+    }
+    CudaLaunchConfig config =
+        GetCudaLaunchConfig(static_cast<int32>(total_count), d);
+    S2B<T, NUM_BLOCK_DIMS,
+        B2S><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+        config.virtual_thread_count, const_cast<T*>(space_tensor.data()), args,
+        const_cast<T*>(batch_tensor.data()));
+    return Status::OK();
+  }
+};
+
+// Instantiate.
+#define INSTANTIATE(NUM_BLOCK_DIMS, T)                                      \
+  template struct SpaceToBatchFunctor<GPUDevice, T, NUM_BLOCK_DIMS, false>; \
+  template struct SpaceToBatchFunctor<GPUDevice, T, NUM_BLOCK_DIMS, true>;  \
+/**/
+
+#define INSTANTIATE_FOR_T(T) \
+  TF_SPACETOBATCH_FOR_EACH_NUM_BLOCK_DIMS(INSTANTIATE, T)
+
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(INSTANTIATE_FOR_T)
+
+#undef INSTANTIATE_FOR_T
+#undef INSTANTIATE
+
+}  // end namespace functor
+}  // end namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/spacetobatch_op.cc b/tensorflow/core/kernels/spacetobatch_op.cc
index 6526b4741bf..a22c4e8f539 100644
--- a/tensorflow/core/kernels/spacetobatch_op.cc
+++ b/tensorflow/core/kernels/spacetobatch_op.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include <string>
 #include <utility>
 
-#include "tensorflow/core/kernels/spacetobatch_op.h"
+#include "tensorflow/core/kernels/spacetobatch_functor.h"
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op.h"
@@ -39,6 +39,178 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
+namespace {
+
+template <typename Device, typename T>
+void SpaceToBatchOpCompute(OpKernelContext* context,
+                           const Tensor& orig_input_tensor,
+                           const Tensor& orig_block_shape,
+                           const Tensor& orig_paddings) {
+  const int input_dims = orig_input_tensor.dims();
+  OP_REQUIRES(
+      context, TensorShapeUtils::IsVector(orig_block_shape.shape()),
+      errors::InvalidArgument("block_shape rank should be 1 instead of ",
+                              orig_block_shape.dims()));
+
+  const int block_dims = orig_block_shape.dim_size(0);
+  OP_REQUIRES(
+      context, orig_input_tensor.dims() >= 1 + block_dims,
+      errors::InvalidArgument("input rank should be >= ", 1 + block_dims,
+                              " instead of ", orig_input_tensor.dims()));
+
+  OP_REQUIRES(context, TensorShapeUtils::IsMatrix(orig_paddings.shape()) &&
+                           block_dims == orig_paddings.dim_size(0) &&
+                           2 == orig_paddings.dim_size(1),
+              errors::InvalidArgument("paddings should have shape [",
+                                      block_dims, ", 2] instead of ",
+                                      orig_paddings.shape().DebugString()));
+
+  // To avoid out-of-bounds access in the case that the block_shape and/or
+  // paddings tensors are concurrently modified, we must copy the values.
+  gtl::InlinedVector<int64, 4> block_shape;
+  gtl::InlinedVector<int64, 8> paddings;
+  internal::spacetobatch::SubtleMustCopyFlat(orig_block_shape, &block_shape);
+  internal::spacetobatch::SubtleMustCopyFlat(orig_paddings, &paddings);
+
+  // Determine the length of the prefix of block dims that can be combined
+  // into the batch dimension due to having no padding and block_shape=1.
+  int removed_prefix_block_dims = 0;
+  for (; removed_prefix_block_dims < block_dims; ++removed_prefix_block_dims) {
+    const int dim = removed_prefix_block_dims;
+    if (paddings[2 * dim] != 0 || paddings[2 * dim + 1] != 0 ||
+        block_shape[dim] != 1) {
+      break;
+    }
+  }
+
+  // Determine the length of the suffix of block dims that can be combined
+  // into the depth dimension due to having no padding and block_shape=1.
+  int removed_suffix_block_dims = 0;
+  for (; removed_suffix_block_dims < block_dims - removed_prefix_block_dims;
+       ++removed_suffix_block_dims) {
+    const int dim = block_dims - 1 - removed_suffix_block_dims;
+    if (paddings[dim * 2] != 0 || paddings[dim * 2 + 1] != 0 ||
+        block_shape[dim] != 1) {
+      break;
+    }
+  }
+
+  // Compute the product of the block_shape values.
+  int64 block_shape_product = 1;
+  for (int block_dim = 0; block_dim < block_dims; ++block_dim) {
+    block_shape_product *= block_shape[block_dim];
+  }
+
+  const int internal_block_dims =
+      block_dims - removed_prefix_block_dims - removed_suffix_block_dims;
+  OP_REQUIRES(context, internal_block_dims <= kMaxSpaceToBatchBlockDims,
+              errors::InvalidArgument(
+                  "Number of non-combined block dimensions is ",
+                  internal_block_dims, " but must not exceed ",
+                  kMaxSpaceToBatchBlockDims));
+
+  if (internal_block_dims == 0) {
+    context->set_output(0, orig_input_tensor);
+    return;
+  }
+
+  // For the purpose of computing the result, the input will be treated as
+  // having this shape, of rank 2 + internal_block_dims.
+  TensorShape internal_input_shape;
+
+  // For the purpose of computing the result, the output will be treated as
+  // having this shape, of rank 2 + internal_block_dims.
+  TensorShape internal_output_shape;
+
+  // The actual output shape exposed to callers.
+  TensorShape external_output_shape;
+
+  external_output_shape.AddDim(orig_input_tensor.dim_size(0) *
+                               block_shape_product);
+
+  int64 input_batch_size = orig_input_tensor.dim_size(0);
+  for (int block_dim = 0; block_dim < removed_prefix_block_dims; ++block_dim) {
+    const int64 size = orig_input_tensor.dim_size(block_dim + 1);
+    input_batch_size *= size;
+    external_output_shape.AddDim(size);
+  }
+  internal_input_shape.AddDim(input_batch_size);
+  internal_output_shape.AddDim(input_batch_size * block_shape_product);
+
+  for (int block_dim = removed_prefix_block_dims;
+       block_dim < block_dims - removed_suffix_block_dims; ++block_dim) {
+    const int64 pad_start = paddings[2 * block_dim],
+                pad_end = paddings[2 * block_dim + 1];
+    OP_REQUIRES(context, pad_start >= 0 && pad_end >= 0,
+                errors::InvalidArgument("Paddings must be non-negative"));
+    const int64 input_size = orig_input_tensor.dim_size(block_dim + 1);
+    const int64 block_shape_value = block_shape[block_dim];
+    const int64 padded_size = input_size + pad_start + pad_end;
+    OP_REQUIRES(
+        context, padded_size % block_shape_value == 0,
+        errors::InvalidArgument("padded_shape[", block_dim, "]=", padded_size,
+                                " is not divisible by block_shape[", block_dim,
+                                "]=", block_shape_value));
+    internal_input_shape.AddDim(input_size);
+    const int64 output_size = padded_size / block_shape_value;
+    internal_output_shape.AddDim(output_size);
+    external_output_shape.AddDim(output_size);
+  }
+
+  int64 depth = 1;
+  for (int dim = block_dims - removed_suffix_block_dims + 1; dim < input_dims;
+       ++dim) {
+    const int64 size = orig_input_tensor.dim_size(dim);
+    external_output_shape.AddDim(size);
+    depth *= size;
+  }
+  internal_input_shape.AddDim(depth);
+  internal_output_shape.AddDim(depth);
+
+  // Allocate output tensor.
+  Tensor* output_tensor = nullptr;
+  OP_REQUIRES_OK(context, context->allocate_output(0, external_output_shape,
+                                                   &output_tensor));
+
+  const int64* internal_paddings = &paddings[2 * removed_prefix_block_dims];
+  const int64* internal_block_shape = &block_shape[removed_prefix_block_dims];
+
+  switch (internal_block_dims) {
+#define TF_SPACETOBATCH_BLOCK_DIMS_CASE(NUM_BLOCK_DIMS)                    \
+  case NUM_BLOCK_DIMS: {                                                   \
+    OP_REQUIRES_OK(                                                        \
+        context,                                                           \
+        (functor::SpaceToBatchFunctor<Device, T, NUM_BLOCK_DIMS, false>()( \
+            context->eigen_device<Device>(),                               \
+            orig_input_tensor.shaped<T, NUM_BLOCK_DIMS + 2>(               \
+                internal_input_shape.dim_sizes()),                         \
+            internal_block_shape, internal_paddings,                       \
+            output_tensor->shaped<T, NUM_BLOCK_DIMS + 2>(                  \
+                internal_output_shape.dim_sizes()))));                     \
+  } break;                                                                 \
+    /**/
+    TF_SPACETOBATCH_FOR_EACH_NUM_BLOCK_DIMS(TF_SPACETOBATCH_BLOCK_DIMS_CASE)
+#undef TF_SPACETOBATCH_BLOCK_DIMS_CASE
+  }
+}
+
+}  // namespace
+
+template <typename Device, typename T>
+class SpaceToBatchNDOp : public OpKernel {
+ public:
+  explicit SpaceToBatchNDOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& orig_input_tensor = context->input(0);
+    const Tensor& orig_block_shape = context->input(1);
+    const Tensor& orig_paddings = context->input(2);
+    SpaceToBatchOpCompute<Device, T>(context, orig_input_tensor,
+                                     orig_block_shape, orig_paddings);
+  }
+};
+
 template <typename Device, typename T>
 class SpaceToBatchOp : public OpKernel {
  public:
@@ -47,6 +219,12 @@ class SpaceToBatchOp : public OpKernel {
     OP_REQUIRES(
         context, block_size_ > 1,
         errors::InvalidArgument("Block size should be > 1: ", block_size_));
+    // We don't use context->allocate_persistent because the allocation must
+    // happen on the CPU regardless of Device.
+    block_shape_ = Tensor(tensorflow::DT_INT64, TensorShape({2}));
+    auto block_shape_vec = block_shape_.vec<int64>();
+    block_shape_vec(0) = block_size_;
+    block_shape_vec(1) = block_size_;
   }
 
   void Compute(OpKernelContext* context) override {
@@ -54,133 +232,46 @@ class SpaceToBatchOp : public OpKernel {
     const Tensor& in1 = context->input(1);
     const int dims = in0.dims();
 
-    // Check on the input dimensions first.
-    // The input is presumed to be [batch, height, width, depth]
     static const int kRequiredDims = 4;
     OP_REQUIRES(context, kRequiredDims == dims,
                 errors::InvalidArgument("Input rank should be: ", kRequiredDims,
                                         "instead of: ", dims));
-
-    // The paddings is presumed to be [2, 2].
-    OP_REQUIRES(
-        context,
-        TensorShapeUtils::IsMatrix(in1.shape()) &&
-        in1.dim_size(0) == 2 && in1.dim_size(1) == 2,
-        errors::InvalidArgument("paddings must be a 2 x 2 matrix: ",
-                                in1.shape().DebugString()));
-    TTypes<int32>::ConstMatrix paddings = in1.matrix<int32>();
-    OP_REQUIRES(context,
-                paddings(0, 0) >= 0 && paddings(0, 1) >= 0 &&
-                paddings(1, 0) >= 0 && paddings(1, 1) >= 0,
-                errors::InvalidArgument("Paddings must be non-negative"));
-
-    // Compute the shape of the zero-padded input tensor.
-    TensorShape padded_shape;
-    padded_shape.AddDim(in0.dim_size(0));
-    padded_shape.AddDim(paddings(0, 0) + in0.dim_size(1) + paddings(0, 1));
-    padded_shape.AddDim(paddings(1, 0) + in0.dim_size(2) + paddings(1, 1));
-    padded_shape.AddDim(in0.dim_size(3));
-
-    const int batch = padded_shape.dim_size(0);
-    const int height = padded_shape.dim_size(1);
-    const int width = padded_shape.dim_size(2);
-    const int depth = padded_shape.dim_size(3);
-
-    // Both height and width must be divisible by block_size.
-    OP_REQUIRES(
-        context, height % block_size_ == 0 && width % block_size_ == 0,
-        errors::InvalidArgument("Image height ", height, " and width ", width,
-                                "should be divisible by block_size: ",
-                                block_size_));
-
-    const int block_size_sq = block_size_ * block_size_;
-
-    // The 'spatial' block of size block_size_ X block_size_ will be moved
-    // to batch.
-    const int output_batch = batch * block_size_sq;
-    const int output_height = height / block_size_;
-    const int output_width = width / block_size_;
-
-    // Allocate output tensor.
-    Tensor* outputs_tensor = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(
-                                0, TensorShape({output_batch, output_height,
-                                                output_width, depth}),
-                                &outputs_tensor));
-
-    typename TTypes<T, 4>::ConstTensor Tinput = in0.tensor<T, 4>();
-    typename TTypes<T, 4>::Tensor Toutput = outputs_tensor->tensor<T, 4>();
-
-    functor::SpaceToBatchOpFunctor<Device, T> functor;
-    functor(context->eigen_device<Device>(),
-            Tinput, paddings, block_size_, Toutput);
-  };
+    SpaceToBatchOpCompute<Device, T>(context, in0, block_shape_, in1);
+  }
 
  private:
   int block_size_;
+  Tensor block_shape_;
 };
 
-// Partial specialization of SpaceToBatchOpFunctor for a CPUDevice.
-namespace functor {
-template <typename T>
-struct SpaceToBatchOpFunctor<CPUDevice, T> {
-  void operator()(const CPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
-                  typename TTypes<int32>::ConstMatrix paddings,
-                  int block_size, typename TTypes<T, 4>::Tensor output) {
-    const int output_batch = output.dimension(0);
-    const int output_height = output.dimension(1);
-    const int output_width = output.dimension(2);
-    const int depth = output.dimension(3);
-
-    const int input_batch = input.dimension(0);
-    const int input_height = input.dimension(1);
-    const int input_width = input.dimension(2);
-
-    const int pad_top = paddings(0, 0);
-    const int pad_left = paddings(1, 0);
-
-    for (int out_b = 0; out_b < output_batch; ++out_b) {
-      // out_b = (offset_h * block_size + offset_w) * input_batch + in_b
-      const int in_b = out_b % input_batch;
-      const int offset_w = (out_b / input_batch) % block_size;
-      const int offset_h = (out_b / input_batch) / block_size;
-      for (int out_h = 0; out_h < output_height; ++out_h) {
-        const int in_h = out_h * block_size + offset_h - pad_top;
-        for (int out_w = 0; out_w < output_width; ++out_w) {
-          const int in_w = out_w * block_size + offset_w - pad_left;
-          if (in_h >= 0 && in_w >= 0 &&
-              in_h < input_height && in_w < input_width) {
-            for (int d = 0; d < depth; ++d) {
-              output(out_b, out_h, out_w, d) = input(in_b, in_h, in_w, d);
-            }
-          } else {
-            for (int d = 0; d < depth; ++d) {
-              output(out_b, out_h, out_w, d) = static_cast<T>(0);
-            }
-          }
-        }
-      }
-    }
-  }
-};
-}  // namespace functor
-
-#define REGISTER(T)                                                     \
-  REGISTER_KERNEL_BUILDER(Name("SpaceToBatch")                          \
-                              .Device(DEVICE_CPU)                       \
-                              .TypeConstraint<T>("T")                   \
-                              .HostMemory("paddings"),                  \
+#define REGISTER(T)                                        \
+  REGISTER_KERNEL_BUILDER(Name("SpaceToBatchND")           \
+                              .Device(DEVICE_CPU)          \
+                              .TypeConstraint<T>("T")      \
+                              .HostMemory("block_shape")   \
+                              .HostMemory("paddings"),     \
+                          SpaceToBatchNDOp<CPUDevice, T>); \
+  REGISTER_KERNEL_BUILDER(Name("SpaceToBatch")             \
+                              .Device(DEVICE_CPU)          \
+                              .TypeConstraint<T>("T")      \
+                              .HostMemory("paddings"),     \
                           SpaceToBatchOp<CPUDevice, T>);
 
 TF_CALL_REAL_NUMBER_TYPES(REGISTER);
 #undef REGISTER
 
 #if GOOGLE_CUDA
-#define REGISTER(T)                                                     \
-  REGISTER_KERNEL_BUILDER(Name("SpaceToBatch")                          \
-                              .Device(DEVICE_GPU)                       \
-                              .TypeConstraint<T>("T")                   \
-                              .HostMemory("paddings"),                  \
+#define REGISTER(T)                                        \
+  REGISTER_KERNEL_BUILDER(Name("SpaceToBatchND")           \
+                              .Device(DEVICE_GPU)          \
+                              .TypeConstraint<T>("T")      \
+                              .HostMemory("block_shape")   \
+                              .HostMemory("paddings"),     \
+                          SpaceToBatchNDOp<GPUDevice, T>); \
+  REGISTER_KERNEL_BUILDER(Name("SpaceToBatch")             \
+                              .Device(DEVICE_GPU)          \
+                              .TypeConstraint<T>("T")      \
+                              .HostMemory("paddings"),     \
                           SpaceToBatchOp<GPUDevice, T>);
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER);
diff --git a/tensorflow/core/kernels/spacetobatch_op.h b/tensorflow/core/kernels/spacetobatch_op.h
deleted file mode 100644
index 9773020639f..00000000000
--- a/tensorflow/core/kernels/spacetobatch_op.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_KERNELS_SPACETOBATCH_OP_H_
-#define TENSORFLOW_CORE_KERNELS_SPACETOBATCH_OP_H_
-
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace tensorflow {
-namespace functor {
-
-// Functor used by SpaceToBatchOp to do the computations.
-template <typename Device, typename T>
-struct SpaceToBatchOpFunctor {
-  // Implements the space to batch conversion.
-  //
-  // input: 4-D input tensor.
-  // paddings: [2, 2] matrix specifying the padding.
-  // block_size: block size for the conversion.
-  // output: 4-D output tensor.
-  //
-  // The dimensions of the tensors are guaranteed to be right when the
-  // functor is called.
-  void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor input,
-                  typename TTypes<int32>::ConstMatrix paddings,
-                  int block_size, typename TTypes<T, 4>::Tensor output);
-};
-
-}  // namespace functor
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_KERNELS_SPACETOBATCH_OP_H_
diff --git a/tensorflow/core/kernels/spacetobatch_op_gpu.cu.cc b/tensorflow/core/kernels/spacetobatch_op_gpu.cu.cc
deleted file mode 100644
index 90fca2325bb..00000000000
--- a/tensorflow/core/kernels/spacetobatch_op_gpu.cu.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#if GOOGLE_CUDA
-
-#define EIGEN_USE_GPU
-
-#include "tensorflow/core/kernels/spacetobatch_op.h"
-
-#include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/cuda_kernel_helper.h"
-
-namespace tensorflow {
-
-typedef Eigen::GpuDevice GPUDevice;
-
-template <typename T>
-__global__ void S2B(
-    const int32 nthreads, const T* input_ptr,
-    const int block_size, const int pad_top, const int pad_left,
-    const int output_batch, const int output_height, const int output_width,
-    const int depth, const int input_batch, const int input_height,
-    const int input_width, T* output_ptr) {
-  CUDA_1D_KERNEL_LOOP(out_idx, nthreads) {
-    // out_idx = d + depth * (w + output_width * (h + output_height * b))
-    const int d = out_idx % depth;
-    const int out_idx2 = out_idx / depth;
-    const int w = out_idx2 % output_width;
-    const int out_idx3 = out_idx2 / output_width;
-    const int h = out_idx3 % output_height;
-    const int b = out_idx3 / output_height;
-
-    const int in_b = b % input_batch;
-    const int offset_w = (b / input_batch) % block_size;
-    const int offset_h = (b / input_batch) / block_size;
-    const int in_h = h * block_size + offset_h - pad_top;
-    const int in_w = w * block_size + offset_w - pad_left;
-
-    if (in_h >= 0 && in_w >= 0 && in_h < input_height && in_w < input_width) {
-      const int inp_idx =
-          d + depth * (in_w + input_width * (in_h + input_height * in_b));
-      output_ptr[out_idx] = ldg(input_ptr + inp_idx);
-    } else {
-      output_ptr[out_idx] = static_cast<T>(0);
-    }
-  }
-}
-
-// Specialization of SpaceToBatchOpFunctor for a GPUDevice.
-namespace functor {
-template <typename T>
-struct SpaceToBatchOpFunctor<GPUDevice, T> {
-  void operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
-                  typename TTypes<int32>::ConstMatrix paddings,
-                  int block_size, typename TTypes<T, 4>::Tensor output) {
-    const int output_batch = output.dimension(0);
-    const int output_height = output.dimension(1);
-    const int output_width = output.dimension(2);
-    const int depth = output.dimension(3);
-
-    const int input_batch = input.dimension(0);
-    const int input_height = input.dimension(1);
-    const int input_width = input.dimension(2);
-
-    const int pad_top = paddings(0, 0);
-    const int pad_left = paddings(1, 0);
-
-    const int total_count =
-        output_batch * output_height * output_width * depth;
-    CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
-    S2B<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, input.data(), block_size, pad_top,
-        pad_left, output_batch, output_height, output_width, depth,
-        input_batch, input_height, input_width, output.data());
-  }
-};
-}  // end namespace functor
-
-// Instantiate the GPU implementation.
-template struct functor::SpaceToBatchOpFunctor<GPUDevice, float>;
-template struct functor::SpaceToBatchOpFunctor<GPUDevice, double>;
-
-}  // end namespace tensorflow
-
-#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/sparse_concat_op.cc b/tensorflow/core/kernels/sparse_concat_op.cc
index 49347389e05..4fd91f5305b 100644
--- a/tensorflow/core/kernels/sparse_concat_op.cc
+++ b/tensorflow/core/kernels/sparse_concat_op.cc
@@ -35,7 +35,7 @@ template <typename T>
 class SparseConcatOp : public OpKernel {
  public:
   explicit SparseConcatOp(OpKernelConstruction* context) : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("concat_dim", &concat_dim_));
+    OP_REQUIRES_OK(context, context->GetAttr("concat_dim", &concat_dim_attr_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -74,19 +74,23 @@ class SparseConcatOp : public OpKernel {
     }
 
     const TensorShape input_shape(shapes[0].vec<int64>());
-    OP_REQUIRES(
-        context, concat_dim_ >= 0 && concat_dim_ < input_shape.dims(),
-        errors::InvalidArgument("Concat dimension must be between 0 and rank (",
-                                input_shape.dims(), "), got ", concat_dim_));
+    const int input_rank = input_shape.dims();
+    const int concat_dim = (concat_dim_attr_ < 0)
+                               ? input_rank + concat_dim_attr_
+                               : concat_dim_attr_;
+    OP_REQUIRES(context, concat_dim >= 0 && concat_dim < input_rank,
+                errors::InvalidArgument("Concat dimension must be in range [",
+                                        -input_rank, ", ", input_rank,
+                                        "), got ", concat_dim_attr_));
     for (int i = 1; i < N; ++i) {
       const TensorShape current_shape(shapes[i].vec<int64>());
-      OP_REQUIRES(context, current_shape.dims() == input_shape.dims(),
-                  errors::InvalidArgument(
-                      "Ranks of all input tensors must match: expected ",
-                      input_shape.dims(), " but got ", current_shape.dims(),
-                      " at position ", i));
-      for (int j = 0; j < input_shape.dims(); ++j) {
-        if (j != concat_dim_) {
+      OP_REQUIRES(
+          context, current_shape.dims() == input_rank,
+          errors::InvalidArgument(
+              "Ranks of all input tensors must match: expected ", input_rank,
+              " but got ", current_shape.dims(), " at position ", i));
+      for (int j = 0; j < input_rank; ++j) {
+        if (j != concat_dim) {
           OP_REQUIRES(
               context, input_shape.dim_size(j) == current_shape.dim_size(j),
               errors::InvalidArgument(
@@ -105,14 +109,14 @@ class SparseConcatOp : public OpKernel {
     // reorder doesn't create race conditions for other ops that may be
     // concurrently reading the indices and values tensors.
 
-    gtl::InlinedVector<int64, 8> std_order(input_shape.dims());
+    gtl::InlinedVector<int64, 8> std_order(input_rank);
     std::iota(std_order.begin(), std_order.end(), 0);
 
     std::vector<int64> concat_order;
-    concat_order.reserve(input_shape.dims());
-    concat_order.push_back(concat_dim_);
-    for (int j = 0; j < input_shape.dims(); ++j) {
-      if (j != concat_dim_) {
+    concat_order.reserve(input_rank);
+    concat_order.push_back(concat_dim);
+    for (int j = 0; j < input_rank; ++j) {
+      if (j != concat_dim) {
         concat_order.push_back(j);
       }
     }
@@ -143,7 +147,7 @@ class SparseConcatOp : public OpKernel {
   }
 
  private:
-  int concat_dim_;
+  int concat_dim_attr_;
 };
 
 #define REGISTER_KERNELS(type)                                           \
diff --git a/tensorflow/core/kernels/sparse_reduce_sum_op.cc b/tensorflow/core/kernels/sparse_reduce_sum_op.cc
index fcc29ab0563..a7e35afe087 100644
--- a/tensorflow/core/kernels/sparse_reduce_sum_op.cc
+++ b/tensorflow/core/kernels/sparse_reduce_sum_op.cc
@@ -24,6 +24,9 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/util/sparse/sparse_tensor.h"
 
+// TODO(b/31496047): Fix non-standard include order.
+#include <numeric>  // clang-format off
+
 using tensorflow::sparse::SparseTensor;
 using tensorflow::gtl::ArraySlice;
 
diff --git a/tensorflow/core/kernels/sparse_xent_op.cc b/tensorflow/core/kernels/sparse_xent_op.cc
index 48124d20af9..9c39841feec 100644
--- a/tensorflow/core/kernels/sparse_xent_op.cc
+++ b/tensorflow/core/kernels/sparse_xent_op.cc
@@ -17,17 +17,36 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include "tensorflow/core/framework/op_kernel.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/kernels/sparse_xent_op.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
 
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
+template <typename Index>
+Status CheckInvalidLabelIndex(const Tensor& labels, int64 max_index) {
+  if (labels.NumElements() == 0) return Status::OK();
+  const auto label_values = labels.vec<Index>();
+  int64 bad_index;
+  auto min_max_dim_value = std::minmax_element(
+      label_values.data(), label_values.data() + label_values.size());
+  if (*min_max_dim_value.first < 0 || *min_max_dim_value.second >= max_index) {
+    bad_index = (*min_max_dim_value.first < 0) ? *min_max_dim_value.first
+                                               : *min_max_dim_value.second;
+    return errors::InvalidArgument("Received a label value of ", bad_index,
+                                   " which is outside the valid range of [0, ",
+                                   max_index, ").  Label values: ",
+                                   labels.SummarizeValue(labels.NumElements()));
+  }
+  return Status::OK();
+}
+
 template <typename Device, typename T, typename Index>
 class SparseSoftmaxXentWithLogitsOp : public OpKernel {
  public:
@@ -66,6 +85,10 @@ class SparseSoftmaxXentWithLogitsOp : public OpKernel {
                    context->allocate_output(1, logits.shape(), &back_out));
 
     if (logits.dim_size(0) > 0) {
+      if (std::is_same<Device, CPUDevice>::value) {
+        OP_REQUIRES_OK(
+            context, CheckInvalidLabelIndex<Index>(labels, logits.dim_size(1)));
+      }
       functor::SparseXentFunctor<Device, T, Index> functor;
       functor(context->eigen_device<Device>(), logits.matrix<T>(),
               labels.vec<Index>(), scratch.vec<T>(), loss_out->vec<T>(),
diff --git a/tensorflow/core/kernels/summary_op.cc b/tensorflow/core/kernels/summary_op.cc
index af75fe92c9a..d8596ba9ea0 100644
--- a/tensorflow/core/kernels/summary_op.cc
+++ b/tensorflow/core/kernels/summary_op.cc
@@ -86,17 +86,17 @@ class SummaryHistoOp : public OpKernel {
     // Build histogram of values in "values" tensor
     histogram::Histogram histo;
     for (int64 i = 0; i < flat.size(); i++) {
-      T v = flat(i);
-      if (Eigen::numext::isnan(v)) {
+      const double double_val = static_cast<double>(flat(i));
+      if (Eigen::numext::isnan(double_val)) {
         c->SetStatus(
             errors::InvalidArgument("Nan in summary histogram for: ", name()));
         break;
-      } else if (Eigen::numext::isinf(v)) {
+      } else if (Eigen::numext::isinf(double_val)) {
         c->SetStatus(errors::InvalidArgument(
             "Infinity in summary histogram for: ", name()));
         break;
       }
-      histo.Add(static_cast<double>(v));
+      histo.Add(double_val);
     }
 
     Summary s;
diff --git a/tensorflow/core/kernels/tensor_array.cc b/tensorflow/core/kernels/tensor_array.cc
index dc1b14ec365..ad3f7cb1e55 100644
--- a/tensorflow/core/kernels/tensor_array.cc
+++ b/tensorflow/core/kernels/tensor_array.cc
@@ -79,7 +79,7 @@ std::atomic<int64> TensorArray::tensor_array_counter{0};
 
 Status TensorArray::CopyShapesFrom(TensorArray* rhs) {
   mutex_lock l(mu_);
-  mutex_lock l_rhs(*rhs->mu());
+  mutex_lock l_rhs(rhs->mu_);
   TF_RETURN_IF_ERROR(LockedReturnIfClosed());
   TF_RETURN_IF_ERROR(rhs->LockedReturnIfClosed());
   if (tensors_.size() != rhs->tensors_.size()) {
diff --git a/tensorflow/core/kernels/tensor_array_ops.cc b/tensorflow/core/kernels/tensor_array_ops.cc
index 2a6b34ea309..cbb7f1cca79 100644
--- a/tensorflow/core/kernels/tensor_array_ops.cc
+++ b/tensorflow/core/kernels/tensor_array_ops.cc
@@ -41,7 +41,9 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
+#if GOOGLE_CUDA
 typedef Eigen::GpuDevice GPUDevice;
+#endif  // GOOGLE_CUDA
 
 namespace tensorflow {
 
@@ -517,6 +519,7 @@ class TensorArrayPackOrGatherOp : public OpKernel {
           new ConstMatrix(value_t->shaped<T, 2>({1, value_t->NumElements()})));
     }
 
+#if GOOGLE_CUDA
     if (std::is_same<Device, GPUDevice>::value) {
       // Switching indexing to int64 might cause performance issues.
       // Hence, we keep int32 indexing in the GPU kernel unless we need to
@@ -528,9 +531,10 @@ class TensorArrayPackOrGatherOp : public OpKernel {
         ConcatGPU64<T>(ctx->eigen_gpu_device(), input_tensors_flat,
                        &output_flat);
       }
-    } else {
-      ConcatCPU<T>(ctx->device(), input_tensors_flat, &output_flat);
+      return;
     }
+#endif  // GOOGLE_CUDA
+    ConcatCPU<T>(ctx->device(), input_tensors_flat, &output_flat);
   }
 
  private:
@@ -716,6 +720,7 @@ class TensorArrayConcatOp : public OpKernel {
     if (output_shape.num_elements() > 0) {
       auto output_flat =
           output_tensor->shaped<T, 2>({1, output_shape.num_elements()});
+#if GOOGLE_CUDA
       if (std::is_same<Device, GPUDevice>::value) {
         // Switching indexing to int64 might cause performance issues.
         // Hence, we keep int32 indexing in the GPU kernel unless we need to
@@ -727,9 +732,10 @@ class TensorArrayConcatOp : public OpKernel {
           ConcatGPU64<T>(ctx->eigen_gpu_device(), input_tensors_flat,
                          &output_flat);
         }
-      } else {
-        ConcatCPU<T>(ctx->device(), input_tensors_flat, &output_flat);
+        return;
       }
+#endif  // GOOGLE_CUDA
+      ConcatCPU<T>(ctx->device(), input_tensors_flat, &output_flat);
     }
   }
 
diff --git a/tensorflow/core/kernels/tf_record_reader_op.cc b/tensorflow/core/kernels/tf_record_reader_op.cc
index 30679069bab..e169498fd3d 100644
--- a/tensorflow/core/kernels/tf_record_reader_op.cc
+++ b/tensorflow/core/kernels/tf_record_reader_op.cc
@@ -38,11 +38,8 @@ class TFRecordReader : public ReaderBase {
     offset_ = 0;
     TF_RETURN_IF_ERROR(env_->NewRandomAccessFile(current_work(), &file_));
 
-    io::RecordReaderOptions options;
-    if (compression_type_ == "ZLIB") {
-      options.compression_type = io::RecordReaderOptions::ZLIB_COMPRESSION;
-    }
-
+    io::RecordReaderOptions options =
+        io::RecordReaderOptions::CreateRecordReaderOptions(compression_type_);
     reader_.reset(new io::RecordReader(file_.get(), options));
     return Status::OK();
   }
diff --git a/tensorflow/core/lib/gtl/inlined_vector.h b/tensorflow/core/lib/gtl/inlined_vector.h
index 21af965da2e..640bbf65979 100644
--- a/tensorflow/core/lib/gtl/inlined_vector.h
+++ b/tensorflow/core/lib/gtl/inlined_vector.h
@@ -441,7 +441,7 @@ class InlinedVector {
     DiscardStorage();
 
     u_.data[kSize - 1] = kSentinel;
-    u_.data[kSize - 2] = target_lg;
+    u_.data[kSize - 2] = static_cast<unsigned char>(target_lg);
     set_size_internal(s);
     DCHECK_EQ(capacity(), target);
     set_outofline_pointer(dst);
diff --git a/tensorflow/core/lib/io/inputbuffer.h b/tensorflow/core/lib/io/inputbuffer.h
index 02dfda74e3f..94a8cfd39be 100644
--- a/tensorflow/core/lib/io/inputbuffer.h
+++ b/tensorflow/core/lib/io/inputbuffer.h
@@ -73,6 +73,9 @@ class InputBuffer {
   // Returns the position in the file.
   int64 Tell() const { return file_pos_ - (limit_ - pos_); }
 
+  // Returns the underlying RandomAccessFile.
+  RandomAccessFile* file() const { return file_; }
+
  private:
   Status FillBuffer();
 
diff --git a/tensorflow/core/lib/io/match.cc b/tensorflow/core/lib/io/match.cc
deleted file mode 100644
index 530c063c495..00000000000
--- a/tensorflow/core/lib/io/match.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/lib/io/match.h"
-#include <fnmatch.h>
-#include <vector>
-#include "tensorflow/core/lib/io/path.h"
-#include "tensorflow/core/platform/env.h"
-
-namespace tensorflow {
-namespace io {
-
-Status GetMatchingFiles(Env* env, const string& pattern,
-                        std::vector<string>* results) {
-  results->clear();
-  std::vector<string> all_files;
-  string dir = Dirname(pattern).ToString();
-  if (dir.empty()) dir = ".";
-  string basename_pattern = Basename(pattern).ToString();
-  Status s = env->GetChildren(dir, &all_files);
-  if (!s.ok()) {
-    return s;
-  }
-  for (const auto& f : all_files) {
-    int flags = 0;
-    if (fnmatch(basename_pattern.c_str(), Basename(f).ToString().c_str(),
-                flags) == 0) {
-      results->push_back(JoinPath(dir, f));
-    }
-  }
-  return Status::OK();
-}
-
-}  // namespace io
-}  // namespace tensorflow
diff --git a/tensorflow/core/lib/io/match.h b/tensorflow/core/lib/io/match.h
deleted file mode 100644
index 0f2089964be..00000000000
--- a/tensorflow/core/lib/io/match.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_LIB_IO_MATCH_H_
-#define TENSORFLOW_LIB_IO_MATCH_H_
-
-#include <vector>
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
-#include "tensorflow/core/platform/env.h"
-
-namespace tensorflow {
-class Env;
-namespace io {
-
-// Given a pattern, return the set of files that match the pattern.
-// Note that this routine only supports wildcard characters in the
-// basename portion of the pattern, not in the directory portion.  If
-// successful, return Status::OK and store the matching files in
-// "*results".  Otherwise, return a non-OK status.
-Status GetMatchingFiles(Env* env, const string& pattern,
-                        std::vector<string>* results);
-
-}  // namespace io
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_LIB_IO_MATCH_H_
diff --git a/tensorflow/core/lib/io/match_test.cc b/tensorflow/core/lib/io/match_test.cc
deleted file mode 100644
index 9871382bcc9..00000000000
--- a/tensorflow/core/lib/io/match_test.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/lib/io/match.h"
-#include <algorithm>
-#include <vector>
-#include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/lib/io/path.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/test.h"
-
-namespace tensorflow {
-namespace io {
-
-static string Match(Env* env, const string& suffix_pattern) {
-  std::vector<string> results;
-  // Testing with unclean paths (// instead of /)
-  Status s = GetMatchingFiles(
-      env, strings::StrCat(testing::TmpDir(), "//", suffix_pattern), &results);
-  if (!s.ok()) {
-    return s.ToString();
-  } else {
-    string r;
-    std::sort(results.begin(), results.end());
-    for (size_t i = 0; i < results.size(); i++) {
-      strings::StrAppend(&r, (i > 0) ? "," : "", Basename(results[i]));
-    }
-    return r;
-  }
-}
-TEST(GetMatchingFiles, Simple) {
-  Env* env = Env::Default();
-  EXPECT_EQ(Match(env, "thereisnosuchfile"), "");
-  EXPECT_EQ(Match(env, "thereisnosuchfile*"), "");
-
-  // Populate a few files
-  TF_EXPECT_OK(WriteStringToFile(Env::Default(),
-                                 JoinPath(testing::TmpDir(), "match-00"), ""));
-  TF_EXPECT_OK(WriteStringToFile(Env::Default(),
-                                 JoinPath(testing::TmpDir(), "match-0a"), ""));
-  TF_EXPECT_OK(WriteStringToFile(Env::Default(),
-                                 JoinPath(testing::TmpDir(), "match-01"), ""));
-  TF_EXPECT_OK(WriteStringToFile(Env::Default(),
-                                 JoinPath(testing::TmpDir(), "match-aaa"), ""));
-
-  EXPECT_EQ(Match(env, "match-*"), "match-00,match-01,match-0a,match-aaa");
-  EXPECT_EQ(Match(env, "match-0[0-9]"), "match-00,match-01");
-  EXPECT_EQ(Match(env, "match-?[0-9]"), "match-00,match-01");
-  EXPECT_EQ(Match(env, "match-?a*"), "match-0a,match-aaa");
-  EXPECT_EQ(Match(env, "match-??"), "match-00,match-01,match-0a");
-}
-
-}  // namespace io
-}  // namespace tensorflow
diff --git a/tensorflow/core/lib/io/record_reader.cc b/tensorflow/core/lib/io/record_reader.cc
index 22801859e88..8cc9d9154c8 100644
--- a/tensorflow/core/lib/io/record_reader.cc
+++ b/tensorflow/core/lib/io/record_reader.cc
@@ -26,6 +26,32 @@ limitations under the License.
 namespace tensorflow {
 namespace io {
 
+RecordReaderOptions RecordReaderOptions::CreateRecordReaderOptions(
+    const string& compression_type) {
+  RecordReaderOptions options;
+  if (compression_type == "ZLIB") {
+    options.compression_type = io::RecordReaderOptions::ZLIB_COMPRESSION;
+#if defined(IS_SLIM_BUILD)
+    LOG(ERROR) << "Compression is not supported but compression_type is set."
+               << " No compression will be used.";
+#else
+    options.zlib_options = io::ZlibCompressionOptions::DEFAULT();
+#endif  // IS_SLIM_BUILD
+  } else if (compression_type == "GZIP") {
+    options.compression_type = io::RecordReaderOptions::ZLIB_COMPRESSION;
+#if defined(IS_SLIM_BUILD)
+    LOG(ERROR) << "Compression is not supported but compression_type is set."
+               << " No compression will be used.";
+#else
+    options.zlib_options = io::ZlibCompressionOptions::GZIP();
+#endif  // IS_SLIM_BUILD
+  } else if (compression_type != "") {
+    LOG(ERROR) << "Unsupported compression_type: " << compression_type
+               << ". No compression will be used.";
+  }
+  return options;
+}
+
 RecordReader::RecordReader(RandomAccessFile* file,
                            const RecordReaderOptions& options)
     : src_(file), options_(options) {
diff --git a/tensorflow/core/lib/io/record_reader.h b/tensorflow/core/lib/io/record_reader.h
index fb675ac98f5..6c92b149637 100644
--- a/tensorflow/core/lib/io/record_reader.h
+++ b/tensorflow/core/lib/io/record_reader.h
@@ -37,6 +37,9 @@ class RecordReaderOptions {
   enum CompressionType { NONE = 0, ZLIB_COMPRESSION = 1 };
   CompressionType compression_type = NONE;
 
+  static RecordReaderOptions CreateRecordReaderOptions(
+      const string& compression_type);
+
 #if !defined(IS_SLIM_BUILD)
   // Options specific to zlib compression.
   ZlibCompressionOptions zlib_options;
diff --git a/tensorflow/core/lib/io/record_writer.cc b/tensorflow/core/lib/io/record_writer.cc
index 516332d2b73..175bfbd827c 100644
--- a/tensorflow/core/lib/io/record_writer.cc
+++ b/tensorflow/core/lib/io/record_writer.cc
@@ -21,6 +21,31 @@ limitations under the License.
 
 namespace tensorflow {
 namespace io {
+RecordWriterOptions RecordWriterOptions::CreateRecordWriterOptions(
+    const string& compression_type) {
+  RecordWriterOptions options;
+  if (compression_type == "ZLIB") {
+    options.compression_type = io::RecordWriterOptions::ZLIB_COMPRESSION;
+#if defined(IS_SLIM_BUILD)
+    LOG(ERROR) << "Compression is not supported but compression_type is set."
+               << " No compression will be used.";
+#else
+    options.zlib_options = io::ZlibCompressionOptions::DEFAULT();
+#endif  // IS_SLIM_BUILD
+  } else if (compression_type == "GZIP") {
+    options.compression_type = io::RecordWriterOptions::ZLIB_COMPRESSION;
+#if defined(IS_SLIM_BUILD)
+    LOG(ERROR) << "Compression is not supported but compression_type is set."
+               << " No compression will be used.";
+#else
+    options.zlib_options = io::ZlibCompressionOptions::GZIP();
+#endif  // IS_SLIM_BUILD
+  } else if (compression_type != "") {
+    LOG(ERROR) << "Unsupported compression_type: " << compression_type
+               << ". No compression will be used.";
+  }
+  return options;
+}
 
 RecordWriter::RecordWriter(WritableFile* dest,
                            const RecordWriterOptions& options)
diff --git a/tensorflow/core/lib/io/record_writer.h b/tensorflow/core/lib/io/record_writer.h
index 3d42a281de9..5a2373d7570 100644
--- a/tensorflow/core/lib/io/record_writer.h
+++ b/tensorflow/core/lib/io/record_writer.h
@@ -36,6 +36,9 @@ class RecordWriterOptions {
   enum CompressionType { NONE = 0, ZLIB_COMPRESSION = 1 };
   CompressionType compression_type = NONE;
 
+  static RecordWriterOptions CreateRecordWriterOptions(
+      const string& compression_type);
+
 // Options specific to zlib compression.
 #if !defined(IS_SLIM_BUILD)
   ZlibCompressionOptions zlib_options;
diff --git a/tensorflow/core/lib/io/zlib_compression_options.h b/tensorflow/core/lib/io/zlib_compression_options.h
index 95af0ab9c94..b1e58758431 100644
--- a/tensorflow/core/lib/io/zlib_compression_options.h
+++ b/tensorflow/core/lib/io/zlib_compression_options.h
@@ -16,13 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_LIB_IO_ZLIB_COMPRESSION_OPTIONS_H_
 #define TENSORFLOW_LIB_IO_ZLIB_COMPRESSION_OPTIONS_H_
 
-// TODO(srbs|vrv): Move to a platform/zlib.h file to centralize all
-// platform-specific includes
-#ifdef __ANDROID__
-#include "zlib.h"
-#else
 #include <zlib.h>
-#endif  // __ANDROID__
 
 namespace tensorflow {
 namespace io {
diff --git a/tensorflow/core/lib/io/zlib_inputstream.h b/tensorflow/core/lib/io/zlib_inputstream.h
index c79c04d9535..31fa3c7dd0d 100644
--- a/tensorflow/core/lib/io/zlib_inputstream.h
+++ b/tensorflow/core/lib/io/zlib_inputstream.h
@@ -16,7 +16,10 @@ limitations under the License.
 #ifndef TENSORFLOW_LIB_IO_ZLIB_INPUTSTREAM_H_
 #define TENSORFLOW_LIB_IO_ZLIB_INPUTSTREAM_H_
 
+#include <zlib.h>
+
 #include <string>
+
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/io/inputstream_interface.h"
 #include "tensorflow/core/lib/io/zlib_compression_options.h"
@@ -24,14 +27,6 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
-// TODO(srbs|vrv): Move to a platform/zlib.h file to centralize all
-// platform-specific includes
-#ifdef __ANDROID__
-#include "zlib.h"
-#else
-#include <zlib.h>
-#endif  // __ANDROID__
-
 namespace tensorflow {
 namespace io {
 
diff --git a/tensorflow/core/lib/io/zlib_outputbuffer.h b/tensorflow/core/lib/io/zlib_outputbuffer.h
index a53c40b8fbc..a33472cfc53 100644
--- a/tensorflow/core/lib/io/zlib_outputbuffer.h
+++ b/tensorflow/core/lib/io/zlib_outputbuffer.h
@@ -16,21 +16,16 @@ limitations under the License.
 #ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_IO_COMPRESSED_OUTPUTBUFFER_H_
 #define THIRD_PARTY_TENSORFLOW_CORE_LIB_IO_COMPRESSED_OUTPUTBUFFER_H_
 
+#include <zlib.h>
+
 #include <string>
+
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/io/zlib_compression_options.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
-// TODO(srbs|vrv): Move to a platform/zlib.h file to centralize all
-// platform-specific includes.
-#ifdef __ANDROID__
-#include "zlib.h"
-#else
-#include <zlib.h>
-#endif  // __ANDROID__
-
 namespace tensorflow {
 namespace io {
 
diff --git a/tensorflow/core/lib/random/random_distributions.h b/tensorflow/core/lib/random/random_distributions.h
index a48811053a5..dc29dc1eaa9 100644
--- a/tensorflow/core/lib/random/random_distributions.h
+++ b/tensorflow/core/lib/random/random_distributions.h
@@ -16,6 +16,10 @@ limitations under the License.
 #ifndef TENSORFLOW_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
 #define TENSORFLOW_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
 
+#define _USE_MATH_DEFINES
+#include <cmath>
+#undef _USE_MATH_DEFINES
+
 #include <math.h>
 #include <string.h>
 #include <algorithm>
diff --git a/tensorflow/core/lib/strings/numbers.cc b/tensorflow/core/lib/strings/numbers.cc
index de7bd30a47b..797233e746d 100644
--- a/tensorflow/core/lib/strings/numbers.cc
+++ b/tensorflow/core/lib/strings/numbers.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <stdlib.h>
 #include <algorithm>
 #include <cmath>
+#include <locale>
 #include <unordered_map>
 
 #include "tensorflow/core/lib/strings/stringprintf.h"
diff --git a/tensorflow/core/lib/strings/scanner.cc b/tensorflow/core/lib/strings/scanner.cc
index 4105f8bef0a..39a2265aa27 100644
--- a/tensorflow/core/lib/strings/scanner.cc
+++ b/tensorflow/core/lib/strings/scanner.cc
@@ -18,7 +18,7 @@ limitations under the License.
 namespace tensorflow {
 namespace strings {
 
-void Scanner::ScanEscapedUntilImpl(char end_ch) {
+void Scanner::ScanUntilImpl(char end_ch, bool escaped) {
   for (;;) {
     if (cur_.empty()) {
       Error();
@@ -30,7 +30,7 @@ void Scanner::ScanEscapedUntilImpl(char end_ch) {
     }
 
     cur_.remove_prefix(1);
-    if (ch == '\\') {
+    if (escaped && ch == '\\') {
       // Escape character, skip next character.
       if (cur_.empty()) {
         Error();
diff --git a/tensorflow/core/lib/strings/scanner.h b/tensorflow/core/lib/strings/scanner.h
index 32c0d6d4a51..8bcf05104cb 100644
--- a/tensorflow/core/lib/strings/scanner.h
+++ b/tensorflow/core/lib/strings/scanner.h
@@ -127,11 +127,17 @@ class Scanner {
   // Shorthand for Any(SPACE).
   Scanner& AnySpace() { return Any(SPACE); }
 
+  // This scans input until <end_ch> is reached. <end_ch> is NOT consumed.
+  Scanner& ScanUntil(char end_ch) {
+    ScanUntilImpl(end_ch, false);
+    return *this;
+  }
+
   // This scans input until <end_ch> is reached. <end_ch> is NOT consumed.
   // Backslash escape sequences are skipped.
   // Used for implementing quoted string scanning.
   Scanner& ScanEscapedUntil(char end_ch) {
-    ScanEscapedUntilImpl(end_ch);
+    ScanUntilImpl(end_ch, true);
     return *this;
   }
 
@@ -154,7 +160,7 @@ class Scanner {
                  StringPiece* capture = nullptr);
 
  private:
-  void ScanEscapedUntilImpl(char end_ch);
+  void ScanUntilImpl(char end_ch, bool escaped);
 
   Scanner& Error() {
     error_ = true;
diff --git a/tensorflow/core/lib/strings/scanner_test.cc b/tensorflow/core/lib/strings/scanner_test.cc
index 491d38e78ee..0d37b100703 100644
--- a/tensorflow/core/lib/strings/scanner_test.cc
+++ b/tensorflow/core/lib/strings/scanner_test.cc
@@ -118,6 +118,33 @@ TEST_F(ScannerTest, OneLiteral) {
   EXPECT_TRUE(Scanner("abc").OneLiteral("ab").OneLiteral("c").GetResult());
 }
 
+TEST_F(ScannerTest, ScanUntil) {
+  StringPiece remaining, match;
+  EXPECT_TRUE(Scanner(R"(' \1 \2 \3 \' \\'rest)")
+                  .OneLiteral("'")
+                  .ScanUntil('\'')
+                  .OneLiteral("'")
+                  .GetResult(&remaining, &match));
+  EXPECT_EQ(R"( \\'rest)", remaining.ToString());
+  EXPECT_EQ(R"(' \1 \2 \3 \')", match.ToString());
+
+  // The "scan until" character is not present.
+  remaining = match = "unset";
+  EXPECT_FALSE(Scanner(R"(' \1 \2 \3 \\rest)")
+                   .OneLiteral("'")
+                   .ScanUntil('\'')
+                   .GetResult(&remaining, &match));
+  EXPECT_EQ("unset", remaining.ToString());
+  EXPECT_EQ("unset", match.ToString());
+
+  // Scan until an escape character.
+  remaining = match = "";
+  EXPECT_TRUE(
+      Scanner(R"(123\456)").ScanUntil('\\').GetResult(&remaining, &match));
+  EXPECT_EQ(R"(\456)", remaining.ToString());
+  EXPECT_EQ("123", match.ToString());
+}
+
 TEST_F(ScannerTest, ScanEscapedUntil) {
   StringPiece remaining, match;
   EXPECT_TRUE(Scanner(R"(' \1 \2 \3 \' \\'rest)")
diff --git a/tensorflow/core/lib/strings/str_util.h b/tensorflow/core/lib/strings/str_util.h
index 288b5994415..135d2f744e2 100644
--- a/tensorflow/core/lib/strings/str_util.h
+++ b/tensorflow/core/lib/strings/str_util.h
@@ -162,7 +162,7 @@ inline std::vector<string> Split(StringPiece text, char delim) {
 template <typename Predicate>
 std::vector<string> Split(StringPiece text, char delim, Predicate p) {
   std::vector<string> result;
-  int token_start = 0;
+  size_t token_start = 0;
   if (!text.empty()) {
     for (size_t i = 0; i < text.size() + 1; i++) {
       if ((i == text.size()) || (text[i] == delim)) {
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 6bdfdc2639e..1ae37819166 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -2567,6 +2567,317 @@ out: 1-D. Values present in `x` but not in `y`.
 idx: 1-D. Positions of `x` values preserved in `out`.
 )doc");
 
+namespace {
+
+// Converts Tensor to flat std::vector<int64>.
+template <typename InputType>
+std::vector<int64> GetFlatInt64(const Tensor& t) {
+  std::vector<int64> output(t.shape().num_elements());
+  auto eigen_vec = t.flat<InputType>();
+  std::copy_n(&eigen_vec(0), output.size(), output.begin());
+  return output;
+}
+
+// Converts int32 or int64 Tensor to flat std::vector<int64>.
+std::vector<int64> GetFlatInt64(const Tensor& t) {
+  if (t.dtype() == DT_INT32) {
+    return GetFlatInt64<int32>(t);
+  } else {
+    return GetFlatInt64<int64>(t);
+  }
+}
+
+Status SpaceToBatchShapeHelper(InferenceContext* c, ShapeHandle input_shape,
+                               ShapeHandle block_shape_shape,
+                               const Tensor* block_shape_t,
+                               ShapeHandle paddings_shape,
+                               const Tensor* paddings_t) {
+  if (c->Rank(block_shape_shape) != 1) {
+    return errors::InvalidArgument("block_shape must have rank 1.");
+  }
+
+  const DimensionHandle num_block_dims_handle = c->Dim(block_shape_shape, 0);
+  if (!c->ValueKnown(num_block_dims_handle)) {
+    return errors::InvalidArgument("block_shape must have known size.");
+  }
+
+  const int64 num_block_dims = c->Value(num_block_dims_handle);
+
+  TF_RETURN_IF_ERROR(
+      c->WithRankAtLeast(input_shape, num_block_dims + 1, &input_shape));
+
+  TF_RETURN_IF_ERROR(
+      c->Merge(paddings_shape, c->Matrix(num_block_dims, 2), &paddings_shape));
+
+  DimensionHandle batch_size = c->Dim(input_shape, 0);
+  std::vector<int64> block_shape_vec;
+  if (block_shape_t) {
+    block_shape_vec = GetFlatInt64(*block_shape_t);
+    for (int64 dim = 0; dim < num_block_dims; ++dim) {
+      const int64 block_shape_value = block_shape_vec[dim];
+      if (block_shape_value < 1) {
+        return errors::InvalidArgument("block_shape must be positive");
+      }
+      if (c->ValueKnown(batch_size)) {
+        TF_RETURN_IF_ERROR(
+            c->Multiply(batch_size, block_shape_value, &batch_size));
+      } else {
+        batch_size = c->UnknownDim();
+      }
+    }
+  } else if (num_block_dims > 0) {
+    batch_size = c->UnknownDim();
+  }
+
+  std::vector<DimensionHandle> output_dims{batch_size};
+  output_dims.resize(num_block_dims + 1, c->UnknownDim());
+
+  if (paddings_t) {
+    const std::vector<int64> paddings_vec = GetFlatInt64(*paddings_t);
+    for (int64 dim = 0; dim < num_block_dims; ++dim) {
+      const int64 pad_start = paddings_vec[dim * 2],
+                  pad_end = paddings_vec[dim * 2 + 1];
+      if (pad_start < 0 || pad_end < 0) {
+        return errors::InvalidArgument("paddings cannot be negative");
+      }
+      if (block_shape_t) {
+        DimensionHandle padded_size;
+        TF_RETURN_IF_ERROR(
+            c->Add(c->Dim(input_shape, dim + 1), pad_start, &padded_size));
+        TF_RETURN_IF_ERROR(c->Add(padded_size, pad_end, &padded_size));
+        TF_RETURN_IF_ERROR(c->Divide(padded_size, block_shape_vec[dim],
+                                     /*evenly_divisible=*/true,
+                                     &output_dims[dim + 1]));
+      }
+    }
+  }
+
+  ShapeHandle remaining_input_shape;
+  TF_RETURN_IF_ERROR(
+      c->Subshape(input_shape, 1 + num_block_dims, &remaining_input_shape));
+
+  ShapeHandle result;
+  TF_RETURN_IF_ERROR(c->Concatenate(c->MakeShape(output_dims),
+                                    remaining_input_shape, &result));
+  c->set_output(0, result);
+  return Status::OK();
+}
+
+Status BatchToSpaceShapeHelper(InferenceContext* c, ShapeHandle input_shape,
+                               ShapeHandle block_shape_shape,
+                               const Tensor* block_shape_t,
+                               ShapeHandle crops_shape, const Tensor* crops_t) {
+  if (c->Rank(block_shape_shape) != 1) {
+    return errors::InvalidArgument("block_shape must have rank 1.");
+  }
+
+  const DimensionHandle num_block_dims_handle = c->Dim(block_shape_shape, 0);
+  if (!c->ValueKnown(num_block_dims_handle)) {
+    return errors::InvalidArgument("block_shape must have known size.");
+  }
+
+  const int64 num_block_dims = c->Value(num_block_dims_handle);
+
+  TF_RETURN_IF_ERROR(
+      c->WithRankAtLeast(input_shape, num_block_dims + 1, &input_shape));
+
+  TF_RETURN_IF_ERROR(
+      c->Merge(crops_shape, c->Matrix(num_block_dims, 2), &crops_shape));
+
+  DimensionHandle batch_size = c->Dim(input_shape, 0);
+  std::vector<int64> block_shape_vec;
+  if (block_shape_t) {
+    block_shape_vec = GetFlatInt64(*block_shape_t);
+    for (int64 dim = 0; dim < num_block_dims; ++dim) {
+      const int64 block_shape_value = block_shape_vec[dim];
+      if (block_shape_value < 1) {
+        return errors::InvalidArgument("block_shape must be positive");
+      }
+      if (c->ValueKnown(batch_size)) {
+        TF_RETURN_IF_ERROR(c->Divide(batch_size, block_shape_value,
+                                     /*evenly_divisible=*/true, &batch_size));
+      } else {
+        batch_size = c->UnknownDim();
+      }
+    }
+  } else if (num_block_dims > 0) {
+    batch_size = c->UnknownDim();
+  }
+
+  std::vector<DimensionHandle> output_dims{batch_size};
+  output_dims.resize(num_block_dims + 1, c->UnknownDim());
+
+  if (crops_t) {
+    const std::vector<int64> crops_vec = GetFlatInt64(*crops_t);
+    for (int64 dim = 0; dim < num_block_dims; ++dim) {
+      const int64 crop_start = crops_vec[dim * 2],
+                  crop_end = crops_vec[dim * 2 + 1];
+      if (crop_start < 0 || crop_end < 0) {
+        return errors::InvalidArgument("crops cannot be negative");
+      }
+      if (block_shape_t) {
+        DimensionHandle cropped_size;
+        TF_RETURN_IF_ERROR(c->Multiply(c->Dim(input_shape, dim + 1),
+                                       block_shape_vec[dim], &cropped_size));
+        TF_RETURN_IF_ERROR(
+            c->Subtract(cropped_size, crop_start, &cropped_size));
+        TF_RETURN_IF_ERROR(
+            c->Subtract(cropped_size, crop_end, &output_dims[dim + 1]));
+      }
+    }
+  }
+
+  ShapeHandle remaining_input_shape;
+  TF_RETURN_IF_ERROR(
+      c->Subshape(input_shape, 1 + num_block_dims, &remaining_input_shape));
+
+  ShapeHandle result;
+  TF_RETURN_IF_ERROR(c->Concatenate(c->MakeShape(output_dims),
+                                    remaining_input_shape, &result));
+  c->set_output(0, result);
+  return Status::OK();
+}
+
+}  // namespace
+
+// --------------------------------------------------------------------------
+REGISTER_OP("SpaceToBatchND")
+    .Input("input: T")
+    .Input("block_shape: Tblock_shape")
+    .Input("paddings: Tpaddings")
+    .Output("output: T")
+    .Attr("T: type")
+    .Attr("Tblock_shape: {int32, int64} = DT_INT32")
+    .Attr("Tpaddings: {int32, int64} = DT_INT32")
+    .SetShapeFn([](InferenceContext* c) {
+      return SpaceToBatchShapeHelper(c, c->input(0), c->input(1),
+                                     c->input_tensor(1), c->input(2),
+                                     c->input_tensor(2));
+    })
+    .Doc(R"doc(
+SpaceToBatch for N-D tensors of type T.
+
+This operation divides "spatial" dimensions `[1, ..., M]` of the input into a
+grid of blocks of shape `block_shape`, and interleaves these blocks with the
+"batch" dimension (0) such that in the output, the spatial dimensions
+`[1, ..., M]` correspond to the position within the grid, and the batch
+dimension combines both the position within a spatial block and the original
+batch position.  Prior to division into blocks, the spatial dimensions of the
+input are optionally zero padded according to `paddings`.  See below for a
+precise description.
+
+input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
+  where spatial_shape has `M` dimensions.
+
+block_shape: 1-D with shape `[M]`, all values must be >= 1.
+
+paddings: 2-D with shape `[M, 2]`, all values must be >= 0.
+  `paddings[i] = [pad_start, pad_end]` specifies the padding for input dimension
+  `i + 1`, which corresponds to spatial dimension `i`.  It is required that
+  `block_shape[i]` divides `input_shape[i + 1] + pad_start + pad_end`.
+
+This operation is equivalent to the following steps:
+
+1. Zero-pad the start and end of dimensions `[1, ..., M]` of the
+   input according to `paddings` to produce `padded` of shape `padded_shape`.
+
+2. Reshape `padded` to `reshaped_padded` of shape:
+     [batch] +
+     [padded_shape[1] / block_shape[0],
+       block_shape[0],
+      ...,
+      padded_shape[M] / block_shape[M-1],
+      block_shape[M-1]] +
+     remaining_shape
+
+3. Permute dimensions of `reshaped_padded` to produce
+   `permuted_reshaped_padded` of shape:
+     block_shape +
+     [batch] +
+     [padded_shape[1] / block_shape[0],
+      ...,
+      padded_shape[M] / block_shape[M-1]] +
+     remaining_shape
+
+4. Reshape `permuted_reshaped_padded` to flatten `block_shape` into the batch
+   dimension, producing an output tensor of shape:
+     [batch * prod(block_shape)] +
+     [padded_shape[1] / block_shape[0],
+      ...,
+      padded_shape[M] / block_shape[M-1]] +
+     remaining_shape
+
+Some examples:
+
+(1) For the following input of shape `[1, 2, 2, 1]`, `block_shape = [2, 2]`, and
+    `paddings = [[0, 0], [0, 0]]`:
+
+```prettyprint
+x = [[[[1], [2]], [[3], [4]]]]
+```
+
+The output tensor has shape `[4, 1, 1, 1]` and value:
+
+```prettyprint
+[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+```
+
+(2) For the following input of shape `[1, 2, 2, 3]`, `block_shape = [2, 2]`, and
+    `paddings = [[0, 0], [0, 0]]`:
+
+```prettyprint
+x = [[[[1, 2, 3], [4, 5, 6]],
+      [[7, 8, 9], [10, 11, 12]]]]
+```
+
+The output tensor has shape `[4, 1, 1, 3]` and value:
+
+```prettyprint
+[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+```
+
+(3) For the following input of shape `[1, 4, 4, 1]`, `block_shape = [2, 2]`, and
+    `paddings = [[0, 0], [0, 0]]`:
+
+```prettyprint
+x = [[[[1],   [2],  [3],  [4]],
+      [[5],   [6],  [7],  [8]],
+      [[9],  [10], [11],  [12]],
+      [[13], [14], [15],  [16]]]]
+```
+
+The output tensor has shape `[4, 2, 2, 1]` and value:
+
+```prettyprint
+x = [[[[1], [3]], [[9], [11]]],
+     [[[2], [4]], [[10], [12]]],
+     [[[5], [7]], [[13], [15]]],
+     [[[6], [8]], [[14], [16]]]]
+```
+
+(4) For the following input of shape `[2, 2, 4, 1]`, block_shape = `[2, 2]`, and
+    paddings = `[[0, 0], [2, 0]]`:
+
+```prettyprint
+x = [[[[1],   [2],  [3],  [4]],
+      [[5],   [6],  [7],  [8]]],
+     [[[9],  [10], [11],  [12]],
+      [[13], [14], [15],  [16]]]]
+```
+
+The output tensor has shape `[8, 1, 3, 1]` and value:
+
+```prettyprint
+x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
+     [[[0], [2], [4]]], [[[0], [10], [12]]],
+     [[[0], [5], [7]]], [[[0], [13], [15]]],
+     [[[0], [6], [8]]], [[[0], [14], [16]]]]
+```
+
+Among others, this operation is useful for reducing atrous convolution into
+regular convolution.
+)doc");
+
 // --------------------------------------------------------------------------
 REGISTER_OP("SpaceToBatch")
     .Input("input: T")
@@ -2576,81 +2887,26 @@ REGISTER_OP("SpaceToBatch")
     .Attr("Tpaddings: {int32, int64} = DT_INT32")
     .Attr("block_size: int >= 2")
     .SetShapeFn([](InferenceContext* c) {
-      ShapeHandle input;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input));
-
-      ShapeHandle paddings;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, &paddings));
-
-      DimensionHandle pad0_dim = c->Dim(paddings, 0);
-      DimensionHandle pad1_dim = c->Dim(paddings, 1);
-
-      if (!c->ValueKnown(pad0_dim) || !c->ValueKnown(pad1_dim)) {
-        c->set_output(0, c->UnknownShapeOfRank(4));
-        return Status::OK();
-      }
-
-      int64 pad0 = c->Value(pad0_dim);
-      int64 pad1 = c->Value(pad1_dim);
-      if (pad0 != 2 || pad1 != 2) {
-        return errors::InvalidArgument(
-            "SpaceToBatch requires paddings with shape [2,2].");
-      }
+      ShapeHandle input_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape));
 
       int32 block_size;
       TF_RETURN_IF_ERROR(c->GetAttr("block_size", &block_size));
 
-      DimensionHandle output_height;
-      DimensionHandle output_width;
+      Tensor block_shape(tensorflow::DT_INT64, TensorShape({2}));
+      auto block_shape_vec = block_shape.vec<int64>();
+      block_shape_vec(0) = block_size;
+      block_shape_vec(1) = block_size;
 
-      const Tensor* paddings_t = c->input_tensor(1);
-      if (paddings_t == nullptr) {
-        output_height = c->UnknownDim();
-        output_width = c->UnknownDim();
-      } else {
-        int64 pad_top, pad_bottom, pad_left, pad_right;
-        if (paddings_t->dtype() == DT_INT32) {
-          auto pad_matrix = paddings_t->matrix<int32>();
-          pad_top = pad_matrix(0, 0);
-          pad_bottom = pad_matrix(0, 1);
-          pad_left = pad_matrix(1, 0);
-          pad_right = pad_matrix(1, 1);
-        } else {
-          auto pad_matrix = paddings_t->matrix<int64>();
-          pad_top = pad_matrix(0, 0);
-          pad_bottom = pad_matrix(0, 1);
-          pad_left = pad_matrix(1, 0);
-          pad_right = pad_matrix(1, 1);
-        }
-
-        if (pad_top < 0 || pad_bottom < 0 || pad_left < 0 || pad_right < 0) {
-          return errors::InvalidArgument("Paddings cannot be negative.");
-        }
-
-        TF_RETURN_IF_ERROR(
-            c->Add(c->Dim(input, 1), pad_top + pad_bottom, &output_height));
-        TF_RETURN_IF_ERROR(
-            c->Add(c->Dim(input, 2), pad_left + pad_right, &output_width));
-      }
-
-      DimensionHandle batch;
-      TF_RETURN_IF_ERROR(
-          c->Multiply(c->Dim(input, 0), block_size * block_size, &batch));
-
-      // Will return an error if block_size does not evenly divide.
-      TF_RETURN_IF_ERROR(c->Divide(output_height, block_size,
-                                   true /* evenly_divisible */,
-                                   &output_height));
-      TF_RETURN_IF_ERROR(c->Divide(output_width, block_size,
-                                   true /* evenly_divisible */, &output_width));
-
-      c->set_output(0, c->MakeShape({batch, output_height, output_width,
-                                     c->Dim(input, 3)}));
-      return Status::OK();
+      return SpaceToBatchShapeHelper(c, input_shape, c->MakeShape({2}),
+                                     &block_shape, c->input(1),
+                                     c->input_tensor(1));
     })
     .Doc(R"doc(
 SpaceToBatch for 4-D tensors of type T.
 
+This is a legacy version of the more general SpaceToBatchND.
+
 Zero-pads and then rearranges (permutes) blocks of spatial data into batch.
 More specifically, this op outputs a copy of the input tensor where values from
 the `height` and `width` dimensions are moved to the `batch` dimension. After
@@ -2746,6 +3002,146 @@ Among others, this operation is useful for reducing atrous convolution into
 regular convolution.
 )doc");
 
+// --------------------------------------------------------------------------
+REGISTER_OP("BatchToSpaceND")
+    .Input("input: T")
+    .Input("block_shape: Tblock_shape")
+    .Input("crops: Tcrops")
+    .Output("output: T")
+    .Attr("T: type")
+    .Attr("Tblock_shape: {int32, int64} = DT_INT32")
+    .Attr("Tcrops: {int32, int64} = DT_INT32")
+    .SetShapeFn([](InferenceContext* c) {
+      return BatchToSpaceShapeHelper(c, c->input(0), c->input(1),
+                                     c->input_tensor(1), c->input(2),
+                                     c->input_tensor(2));
+    })
+    .Doc(R"doc(
+BatchToSpace for N-D tensors of type T.
+
+This operation reshapes the "batch" dimension 0 into `M + 1` dimensions of shape
+`block_shape + [batch]`, interleaves these blocks back into the grid defined by
+the spatial dimensions `[1, ..., M]`, to obtain a result with the same rank as
+the input.  The spatial dimensions of this intermediate result are then
+optionally cropped according to `crops` to produce the output.  This is the
+reverse of SpaceToBatch.  See below for a precise description.
+
+input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
+  where spatial_shape has M dimensions.
+
+block_shape: 1-D with shape `[M]`, all values must be >= 1.
+
+crops: 2-D with shape `[M, 2]`, all values must be >= 0.
+  `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input
+  dimension `i + 1`, which corresponds to spatial dimension `i`.  It is
+  required that
+  `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.
+
+This operation is equivalent to the following steps:
+
+1. Reshape `input` to `reshaped` of shape:
+     [block_shape[0], ..., block_shape[M-1],
+      batch / prod(block_shape),
+      input_shape[1], ..., input_shape[N-1]]
+
+2. Permute dimensions of `reshaped` to produce `permuted` of shape
+     [batch / prod(block_shape),
+
+      input_shape[1], block_shape[0],
+      ...,
+      input_shape[M], block_shape[M-1],
+
+      input_shape[M+1], ..., input_shape[N-1]]
+
+3. Reshape `permuted` to produce `reshaped_permuted` of shape
+     [batch / prod(block_shape),
+
+      input_shape[1] * block_shape[0],
+      ...,
+      input_shape[M] * block_shape[M-1],
+
+      input_shape[M+1],
+      ...,
+      input_shape[N-1]]
+
+4. Crop the start and end of dimensions `[1, ..., M]` of
+   `reshaped_permuted` according to `crops` to produce the output of shape:
+     [batch / prod(block_shape),
+
+      input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],
+      ...,
+      input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],
+
+      input_shape[M+1], ..., input_shape[N-1]]
+
+Some examples:
+
+(1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and
+    `crops = [[0, 0], [0, 0]]`:
+
+```prettyprint
+[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+```
+
+The output tensor has shape `[1, 2, 2, 1]` and value:
+
+```prettyprint
+x = [[[[1], [2]], [[3], [4]]]]
+```
+
+(2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and
+    `crops = [[0, 0], [0, 0]]`:
+
+```prettyprint
+[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+```
+
+The output tensor has shape `[1, 2, 2, 3]` and value:
+
+```prettyprint
+x = [[[[1, 2, 3], [4, 5, 6]],
+      [[7, 8, 9], [10, 11, 12]]]]
+```
+
+(3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and
+    `crops = [[0, 0], [0, 0]]`:
+
+```prettyprint
+x = [[[[1], [3]], [[9], [11]]],
+     [[[2], [4]], [[10], [12]]],
+     [[[5], [7]], [[13], [15]]],
+     [[[6], [8]], [[14], [16]]]]
+```
+
+The output tensor has shape `[1, 4, 4, 1]` and value:
+
+```prettyprint
+x = [[[[1],   [2],  [3],  [4]],
+      [[5],   [6],  [7],  [8]],
+      [[9],  [10], [11],  [12]],
+      [[13], [14], [15],  [16]]]]
+```
+
+(4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and
+    `crops = [[0, 0], [2, 0]]`:
+
+```prettyprint
+x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
+     [[[0], [2], [4]]], [[[0], [10], [12]]],
+     [[[0], [5], [7]]], [[[0], [13], [15]]],
+     [[[0], [6], [8]]], [[[0], [14], [16]]]]
+```
+
+The output tensor has shape `[2, 2, 4, 1]` and value:
+
+```prettyprint
+x = [[[[1],   [2],  [3],  [4]],
+      [[5],   [6],  [7],  [8]]],
+     [[[9],  [10], [11],  [12]],
+      [[13], [14], [15],  [16]]]]
+```
+)doc");
+
 // --------------------------------------------------------------------------
 REGISTER_OP("BatchToSpace")
     .Input("input: T")
@@ -2755,81 +3151,26 @@ REGISTER_OP("BatchToSpace")
     .Attr("block_size: int >= 2")
     .Attr("Tidx: {int32, int64} = DT_INT32")
     .SetShapeFn([](InferenceContext* c) {
-      ShapeHandle input;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input));
-
-      ShapeHandle crops;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, &crops));
-
-      DimensionHandle crops0_dim = c->Dim(crops, 0);
-      DimensionHandle crops1_dim = c->Dim(crops, 1);
-
-      if (!c->ValueKnown(crops0_dim) || !c->ValueKnown(crops1_dim)) {
-        c->set_output(0, c->UnknownShapeOfRank(4));
-        return Status::OK();
-      }
-
-      int64 crops0 = c->Value(crops0_dim);
-      int64 crops1 = c->Value(crops1_dim);
-      if (crops0 != 2 || crops1 != 2) {
-        return errors::InvalidArgument(
-            "BatchToSpace requires crops with shape [2,2].");
-      }
+      ShapeHandle input_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape));
 
       int32 block_size;
       TF_RETURN_IF_ERROR(c->GetAttr("block_size", &block_size));
 
-      DimensionHandle batch;
-      // Will return an error if does not evenly divide
-      TF_RETURN_IF_ERROR(c->Divide(c->Dim(input, 0), block_size * block_size,
-                                   true /* evenly_divisible */, &batch));
+      Tensor block_shape(tensorflow::DT_INT64, TensorShape({2}));
+      auto block_shape_vec = block_shape.vec<int64>();
+      block_shape_vec(0) = block_size;
+      block_shape_vec(1) = block_size;
 
-      DimensionHandle output_height;
-      DimensionHandle output_width;
-
-      const Tensor* crops_t = c->input_tensor(1);
-      if (crops_t == nullptr) {
-        output_height = c->UnknownDim();
-        output_width = c->UnknownDim();
-      } else {
-        int64 crops_top, crops_bottom, crops_left, crops_right;
-        if (crops_t->dtype() == DT_INT32) {
-          auto crops_matrix = crops_t->matrix<int32>();
-          crops_top = crops_matrix(0, 0);
-          crops_bottom = crops_matrix(0, 1);
-          crops_left = crops_matrix(1, 0);
-          crops_right = crops_matrix(1, 1);
-        } else {
-          auto crops_matrix = crops_t->matrix<int64>();
-          crops_top = crops_matrix(0, 0);
-          crops_bottom = crops_matrix(0, 1);
-          crops_left = crops_matrix(1, 0);
-          crops_right = crops_matrix(1, 1);
-        }
-
-        if (crops_top < 0 || crops_bottom < 0 || crops_left < 0 ||
-            crops_right < 0) {
-          return errors::InvalidArgument("Croppings cannot be negative.");
-        }
-
-        TF_RETURN_IF_ERROR(
-            c->Multiply(c->Dim(input, 1), block_size, &output_height));
-        TF_RETURN_IF_ERROR(c->Subtract(
-            output_height, (crops_top + crops_bottom), &output_height));
-
-        TF_RETURN_IF_ERROR(
-            c->Multiply(c->Dim(input, 2), block_size, &output_width));
-        TF_RETURN_IF_ERROR(c->Subtract(output_width, (crops_left + crops_right),
-                                       &output_width));
-      }
-
-      c->set_output(0, c->MakeShape({batch, output_height, output_width,
-                                     c->Dim(input, 3)}));
-      return Status::OK();
+      return BatchToSpaceShapeHelper(c, input_shape, c->MakeShape({2}),
+                                     &block_shape, c->input(1),
+                                     c->input_tensor(1));
     })
     .Doc(R"doc(
 BatchToSpace for 4-D tensors of type T.
 
+This is a legacy version of the more general BatchToSpaceND.
+
 Rearranges (permutes) data from batch into blocks of spatial data, followed by
 cropping. This is the reverse transformation of SpaceToBatch. More specifically,
 this op outputs a copy of the input tensor where values from the `batch`
diff --git a/tensorflow/core/ops/array_ops_test.cc b/tensorflow/core/ops/array_ops_test.cc
index afbff68126d..3cb0f742c1f 100644
--- a/tensorflow/core/ops/array_ops_test.cc
+++ b/tensorflow/core/ops/array_ops_test.cc
@@ -969,13 +969,12 @@ TEST(ArrayOpsTest, SpaceToBatch_ShapeFn) {
   // Paddings not known, but batch size can be computed.
   INFER_OK(op, "[1,10,10,3];[2,2]", "[4,?,?,d0_3]");
 
-  // Unknown paddings means unknown shape of rank 4.
-  INFER_OK(op, "[1,10,10,3];?", "[?,?,?,?]");
+  // Unknown paddings means unknown width and height.
+  INFER_OK(op, "[1,10,10,3];?", "[4,?,?,d0_3]");
 
   // Paddings not correct shape
-  INFER_ERROR("Shape must be rank 2 but is rank 1", op, "[1,10,10,3];[4]");
-  INFER_ERROR("SpaceToBatch requires paddings with shape [2,2]", op,
-              "[1,10,10,3];[2,3]");
+  INFER_ERROR("rank", op, "[1,10,10,3];[4]");
+  INFER_ERROR("3 and 2", op, "[1,10,10,3];[2,3]");
 
   Tensor paddings = test::AsTensor<int32>({4, 2, 2, 4}, {{2, 2}});
   op.input_tensors[1] = &paddings;
@@ -995,6 +994,83 @@ TEST(ArrayOpsTest, SpaceToBatch_ShapeFn) {
   INFER_ERROR("cannot be negative", op, "[1,10,10,3];[2,2]");
 }
 
+TEST(ArrayOpsTest, SpaceToBatchND_ShapeFn) {
+  ShapeInferenceTestOp op("SpaceToBatchND");
+  op.input_tensors.resize(3);
+  TF_ASSERT_OK(NodeDefBuilder("test", "SpaceToBatchND")
+                   .Input("input", 0, DT_FLOAT)
+                   .Input("block_shape", 1, DT_INT32)
+                   .Input("paddings", 2, DT_INT32)
+                   .Finalize(&op.node_def));
+
+  // Verify that input shape and paddings shape can be unknown.
+  INFER_OK(op, "?;[2];?", "?");
+
+  // Only number of input dimensions is known.
+  INFER_OK(op, "[?,?,?,?];[2];?", "[?,?,?,d0_3]");
+
+  // Dimensions are partially known.
+  INFER_OK(op, "[?,?,?,2];[2];?", "[?,?,?,d0_3]");
+
+  {
+    // Dimensions are partially known, block_shape known.
+    Tensor block_shape = test::AsTensor<int32>({2, 3});
+    op.input_tensors[1] = &block_shape;
+    INFER_OK(op, "[3,?,?,2];[2];?", "[18,?,?,d0_3]");
+
+    // Dimensions are partially known, block_shape and paddings known.
+    {
+      Tensor paddings = test::AsTensor<int32>({1, 1, 0, 1}, {{2, 2}});
+      op.input_tensors[2] = &paddings;
+      INFER_OK(op, "[3,?,2,2];[2];[2,2]", "[18,?,1,d0_3]");
+      op.input_tensors[2] = nullptr;
+    }
+
+    // Dimensions are fully known, block_shape and paddings are known.
+    {
+      Tensor paddings = test::AsTensor<int32>({1, 1, 0, 0}, {{2, 2}});
+      op.input_tensors[2] = &paddings;
+      INFER_OK(op, "[3,2,3,2];[2];[2,2]", "[18,2,1,d0_3]");
+      op.input_tensors[2] = nullptr;
+    }
+
+    op.input_tensors[1] = nullptr;
+  }
+
+  INFER_ERROR("block_shape must have rank 1", op, "?;[1,1];?");
+  INFER_ERROR("block_shape must have known size", op, "?;[?];?");
+
+  {
+    Tensor block_shape = test::AsTensor<int32>({0, 2});
+    op.input_tensors[1] = &block_shape;
+    INFER_ERROR("block_shape must be positive", op, "[1,2,2];[2];[2,2]");
+    op.input_tensors[1] = nullptr;
+  }
+
+  {
+    Tensor block_shape = test::AsTensor<int32>({1, 1});
+    op.input_tensors[1] = &block_shape;
+    Tensor paddings = test::AsTensor<int32>({0, -1, 0, 0}, {{2, 2}});
+    op.input_tensors[2] = &paddings;
+    INFER_ERROR("paddings cannot be negative", op, "[1,2,2];[2];[2,2]");
+    op.input_tensors[1] = nullptr;
+    op.input_tensors[2] = nullptr;
+  }
+
+  {
+    Tensor block_shape = test::AsTensor<int32>({3, 3});
+    op.input_tensors[1] = &block_shape;
+    Tensor paddings = test::AsTensor<int32>({0, 0, 0, 0}, {{2, 2}});
+    op.input_tensors[2] = &paddings;
+    INFER_ERROR("divisible", op, "[1,2,3,1];[2];[2,2]");
+    op.input_tensors[1] = nullptr;
+    op.input_tensors[2] = nullptr;
+  }
+
+  INFER_ERROR("rank", op, "[1,3,3,1];[2];[1]");
+  INFER_ERROR("shape", op, "[1,3,3,1];[2];[1,2]");
+}
+
 TEST(ArrayOpsTest, BatchToSpace_ShapeFn) {
   ShapeInferenceTestOp op("BatchToSpace");
   op.input_tensors.resize(2);
@@ -1008,16 +1084,15 @@ TEST(ArrayOpsTest, BatchToSpace_ShapeFn) {
   INFER_OK(op, "[4,8,8,3];[2,2]", "[1,?,?,d0_3]");
 
   // block_size not compatible with batch size
-  INFER_ERROR("Dimension size must be evenly divisible by 4 but is 5", op,
+  INFER_ERROR("Dimension size must be evenly divisible by", op,
               "[5,8,8,3];[2,2]");
 
-  // Unknown croppings means unknown shape
-  INFER_OK(op, "[4,8,8,3];?", "[?,?,?,?]");
+  // Unknown croppings means unknown width and height.
+  INFER_OK(op, "[4,8,8,3];?", "[1,?,?,d0_3]");
 
   // croppings not correct shape
-  INFER_ERROR("Shape must be rank 2 but is rank 1", op, "[4,8,8,3];[4]");
-  INFER_ERROR("BatchToSpace requires crops with shape [2,2]", op,
-              "[4,8,8,3];[2,3]");
+  INFER_ERROR("rank", op, "[4,8,8,3];[4]");
+  INFER_ERROR("3 and 2", op, "[4,8,8,3];[2,3]");
 
   Tensor croppings = test::AsTensor<int64>({4, 2, 2, 4}, {{2, 2}});
   op.input_tensors[1] = &croppings;
@@ -1039,6 +1114,90 @@ TEST(ArrayOpsTest, BatchToSpace_ShapeFn) {
   INFER_ERROR("cannot be negative", op, "[4,8,8,3];[2,2]");
 }
 
+TEST(ArrayOpsTest, BatchToSpaceND_ShapeFn) {
+  ShapeInferenceTestOp op("BatchToSpaceND");
+  op.input_tensors.resize(3);
+  TF_ASSERT_OK(NodeDefBuilder("test", "BatchToSpaceND")
+                   .Input("input", 0, DT_FLOAT)
+                   .Input("block_shape", 1, DT_INT32)
+                   .Input("crops", 2, DT_INT32)
+                   .Finalize(&op.node_def));
+
+  // Verify that input shape and crops shape can be unknown.
+  INFER_OK(op, "?;[2];?", "?");
+
+  // Only number of input dimensions is known.
+  INFER_OK(op, "[?,?,?,?];[2];?", "[?,?,?,d0_3]");
+
+  {
+    // Dimensions are partially known, block_shape known.
+    Tensor block_shape = test::AsTensor<int32>({2, 3});
+    op.input_tensors[1] = &block_shape;
+    INFER_OK(op, "[?,?,?,2];[2];?", "[?,?,?,d0_3]");
+
+    INFER_OK(op, "[18,?,?,2];[2];?", "[3,?,?,d0_3]");
+
+    // Dimensions are partially known, block_shape and crops known.
+    {
+      Tensor crops = test::AsTensor<int32>({1, 1, 0, 1}, {{2, 2}});
+      op.input_tensors[2] = &crops;
+      INFER_OK(op, "[18,?,2,2];[2];[2,2]", "[3,?,5,d0_3]");
+      op.input_tensors[2] = nullptr;
+    }
+
+    // Dimensions are fully known, block_shape and crops are known.
+    {
+      Tensor crops = test::AsTensor<int32>({1, 1, 0, 0}, {{2, 2}});
+      op.input_tensors[2] = &crops;
+      INFER_OK(op, "[18,2,1,2];[2];[2,2]", "[3,2,3,d0_3]");
+      op.input_tensors[2] = nullptr;
+    }
+
+    op.input_tensors[1] = nullptr;
+  }
+
+  INFER_ERROR("block_shape must have rank 1", op, "?;[1,1];?");
+  INFER_ERROR("block_shape must have known size", op, "?;[?];?");
+  INFER_ERROR("rank", op, "[2,2];[2];[2,2]");
+  INFER_ERROR("rank", op, "[2,2,3];[3];[3,2]");
+
+  {
+    Tensor block_shape = test::AsTensor<int32>({0, 2});
+    op.input_tensors[1] = &block_shape;
+    INFER_ERROR("block_shape must be positive", op, "[1,2,2];[2];[2,2]");
+    op.input_tensors[1] = nullptr;
+  }
+
+  {
+    Tensor block_shape = test::AsTensor<int32>({1, 1});
+    op.input_tensors[1] = &block_shape;
+    Tensor paddings = test::AsTensor<int32>({0, -1, 0, 0}, {{2, 2}});
+    op.input_tensors[2] = &paddings;
+    INFER_ERROR("crops cannot be negative", op, "[1,2,2];[2];[2,2]");
+    op.input_tensors[1] = nullptr;
+    op.input_tensors[2] = nullptr;
+  }
+
+  // The amount to crop exceeds the padded size.
+  {
+    Tensor block_shape = test::AsTensor<int32>({2, 2});
+    op.input_tensors[1] = &block_shape;
+    Tensor crops = test::AsTensor<int32>({3, 2, 0, 0}, {{2, 2}});
+    op.input_tensors[2] = &crops;
+    INFER_ERROR("Negative", op, "[4,2,3,1];[2];[2,2]");
+    op.input_tensors[1] = nullptr;
+    op.input_tensors[2] = nullptr;
+  }
+
+  // The batch size is not divisible by the product of the block_shape.
+  {
+    Tensor block_shape = test::AsTensor<int32>({2, 3});
+    op.input_tensors[1] = &block_shape;
+    INFER_ERROR("divisible", op, "[3,1,1,1];[2];[2,2]");
+    op.input_tensors[1] = nullptr;
+  }
+}
+
 TEST(ArrayOpsTest, SpaceToDepth_ShapeFn) {
   ShapeInferenceTestOp op("SpaceToDepth");
   TF_ASSERT_OK(NodeDefBuilder("test", "SpaceToDepth")
diff --git a/tensorflow/core/ops/compat/ops_history.v0.pbtxt b/tensorflow/core/ops/compat/ops_history.v0.pbtxt
index c1bcc0a9734..c2104dfea15 100644
--- a/tensorflow/core/ops/compat/ops_history.v0.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v0.pbtxt
@@ -6080,6 +6080,55 @@ op {
     }
   }
 }
+op {
+  name: "BatchToSpaceND"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "block_shape"
+    type_attr: "Tblock_shape"
+  }
+  input_arg {
+    name: "crops"
+    type_attr: "Tcrops"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tblock_shape"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tcrops"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "Betainc"
   input_arg {
@@ -24505,6 +24554,55 @@ op {
     minimum: 2
   }
 }
+op {
+  name: "SpaceToBatchND"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "block_shape"
+    type_attr: "Tblock_shape"
+  }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tblock_shape"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "SpaceToDepth"
   input_arg {
@@ -26076,6 +26174,50 @@ op {
     type: "type"
   }
 }
+op {
+  name: "SparseConcat"
+  input_arg {
+    name: "indices"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "concat_dim"
+    type: "int"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
 op {
   name: "SparseDenseCwiseAdd"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index bd6d238da82..fe7c66b422e 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -2769,7 +2769,61 @@ op {
     }
   }
   summary: "BatchToSpace for 4-D tensors of type T."
-  description: "Rearranges (permutes) data from batch into blocks of spatial data, followed by\ncropping. This is the reverse transformation of SpaceToBatch. More specifically,\nthis op outputs a copy of the input tensor where values from the `batch`\ndimension are moved in spatial blocks to the `height` and `width` dimensions,\nfollowed by cropping along the `height` and `width` dimensions."
+  description: "This is a legacy version of the more general BatchToSpaceND.\n\nRearranges (permutes) data from batch into blocks of spatial data, followed by\ncropping. This is the reverse transformation of SpaceToBatch. More specifically,\nthis op outputs a copy of the input tensor where values from the `batch`\ndimension are moved in spatial blocks to the `height` and `width` dimensions,\nfollowed by cropping along the `height` and `width` dimensions."
+}
+op {
+  name: "BatchToSpaceND"
+  input_arg {
+    name: "input"
+    description: "N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,\nwhere spatial_shape has M dimensions."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "block_shape"
+    description: "1-D with shape `[M]`, all values must be >= 1."
+    type_attr: "Tblock_shape"
+  }
+  input_arg {
+    name: "crops"
+    description: "2-D with shape `[M, 2]`, all values must be >= 0.\n  `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input\n  dimension `i + 1`, which corresponds to spatial dimension `i`.  It is\n  required that\n  `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.\n\nThis operation is equivalent to the following steps:\n\n1. Reshape `input` to `reshaped` of shape:\n     [block_shape[0], ..., block_shape[M-1],\n      batch / prod(block_shape),\n      input_shape[1], ..., input_shape[N-1]]\n\n2. Permute dimensions of `reshaped` to produce `permuted` of shape\n     [batch / prod(block_shape),\n\n      input_shape[1], block_shape[0],\n      ...,\n      input_shape[M], block_shape[M-1],\n\n      input_shape[M+1], ..., input_shape[N-1]]\n\n3. Reshape `permuted` to produce `reshaped_permuted` of shape\n     [batch / prod(block_shape),\n\n      input_shape[1] * block_shape[0],\n      ...,\n      input_shape[M] * block_shape[M-1],\n\n      input_shape[M+1],\n      ...,\n      input_shape[N-1]]\n\n4. 
Crop the start and end of dimensions `[1, ..., M]` of\n   `reshaped_permuted` according to `crops` to produce the output of shape:\n     [batch / prod(block_shape),\n\n      input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],\n      ...,\n      input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],\n\n      input_shape[M+1], ..., input_shape[N-1]]\n\nSome examples:\n\n(1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and\n    `crops = [[0, 0], [0, 0]]`:\n\n```prettyprint\n[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]\n```\n\nThe output tensor has shape `[1, 2, 2, 1]` and value:\n\n```prettyprint\nx = [[[[1], [2]], [[3], [4]]]]\n```\n\n(2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and\n    `crops = [[0, 0], [0, 0]]`:\n\n```prettyprint\n[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]\n```\n\nThe output tensor has shape `[1, 2, 2, 3]` and value:\n\n```prettyprint\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\n(3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and\n    `crops = [[0, 0], [0, 0]]`:\n\n```prettyprint\nx = [[[[1], [3]], [[5], [7]]],\n     [[[2], [4]], [[10], [12]]],\n     [[[5], [7]], [[13], [15]]],\n     [[[6], [8]], [[14], [16]]]]\n```\n\nThe output tensor has shape `[1, 4, 4, 1]` and value:\n\n```prettyprint\nx = [[[1],   [2],  [3],  [4]],\n     [[5],   [6],  [7],  [8]],\n     [[9],  [10], [11],  [12]],\n     [[13], [14], [15],  [16]]]\n```\n\n(4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and\n    `crops = [[0, 0], [2, 0]]`:\n\n```prettyprint\nx = [[[[0], [1], [3]]], [[[0], [9], [11]]],\n     [[[0], [2], [4]]], [[[0], [10], [12]]],\n     [[[0], [5], [7]]], [[[0], [13], [15]]],\n     [[[0], [6], [8]]], [[[0], [14], [16]]]]\n```\n\nThe output tensor has shape `[2, 2, 4, 1]` and value:\n\n```prettyprint\nx = [[[[1],   [2],  [3],  [4]],\n      [[5],   [6],  [7],  [8]]],\n     [[[9],  [10], [11],  
[12]],\n      [[13], [14], [15],  [16]]]]\n```"
+    type_attr: "Tcrops"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tblock_shape"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tcrops"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  summary: "BatchToSpace for N-D tensors of type T."
+  description: "This operation reshapes the \"batch\" dimension 0 into `M + 1` dimensions of shape\n`block_shape + [batch]`, interleaves these blocks back into the grid defined by\nthe spatial dimensions `[1, ..., M]`, to obtain a result with the same rank as\nthe input.  The spatial dimensions of this intermediate result are then\noptionally cropped according to `crops` to produce the output.  This is the\nreverse of SpaceToBatch.  See below for a precise description."
 }
 op {
   name: "Betainc"
@@ -5407,7 +5461,7 @@ op {
     }
   }
   summary: "Draw bounding boxes on a batch of images."
-  description: "Outputs a copy of `images` but draws on top of the pixels zero or more bounding\nboxes specified by the locations in `boxes`. The coordinates of the each\nbounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The\nbounding box coordinates are floats in `[0.0, 1.0]` relative to the width and\nheight of the underlying image.\n\nFor example, if an image is 100 x 200 pixels and the bounding box is\n`[0.1, 0.5, 0.2, 0.9]`, the bottom-left and upper-right coordinates of the\nbounding box will be `(10, 40)` to `(50, 180)`.\n\nParts of the bounding box may fall outside the image."
+  description: "Outputs a copy of `images` but draws on top of the pixels zero or more bounding\nboxes specified by the locations in `boxes`. The coordinates of the each\nbounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The\nbounding box coordinates are floats in `[0.0, 1.0]` relative to the width and\nheight of the underlying image.\n\nFor example, if an image is 100 x 200 pixels and the bounding box is\n`[0.1, 0.2, 0.5, 0.9]`, the bottom-left and upper-right coordinates of the\nbounding box will be `(10, 40)` to `(50, 180)`.\n\nParts of the bounding box may fall outside the image."
 }
 op {
   name: "DynamicPartition"
@@ -14023,7 +14077,61 @@ op {
     minimum: 2
   }
   summary: "SpaceToBatch for 4-D tensors of type T."
-  description: "Zero-pads and then rearranges (permutes) blocks of spatial data into batch.\nMore specifically, this op outputs a copy of the input tensor where values from\nthe `height` and `width` dimensions are moved to the `batch` dimension. After\nthe zero-padding, both `height` and `width` of the input must be divisible by the\nblock size."
+  description: "This is a legacy version of the more general SpaceToBatchND.\n\nZero-pads and then rearranges (permutes) blocks of spatial data into batch.\nMore specifically, this op outputs a copy of the input tensor where values from\nthe `height` and `width` dimensions are moved to the `batch` dimension. After\nthe zero-padding, both `height` and `width` of the input must be divisible by the\nblock size."
+}
+op {
+  name: "SpaceToBatchND"
+  input_arg {
+    name: "input"
+    description: "N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,\nwhere spatial_shape has `M` dimensions."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "block_shape"
+    description: "1-D with shape `[M]`, all values must be >= 1."
+    type_attr: "Tblock_shape"
+  }
+  input_arg {
+    name: "paddings"
+    description: "2-D with shape `[M, 2]`, all values must be >= 0.\n  `paddings[i] = [pad_start, pad_end]` specifies the padding for input dimension\n  `i + 1`, which corresponds to spatial dimension `i`.  It is required that\n  `block_shape[i]` divides `input_shape[i + 1] + pad_start + pad_end`.\n\nThis operation is equivalent to the following steps:\n\n1. Zero-pad the start and end of dimensions `[1, ..., M]` of the\n   input according to `paddings` to produce `padded` of shape `padded_shape`.\n\n2. Reshape `padded` to `reshaped_padded` of shape:\n     [batch] +\n     [padded_shape[1] / block_shape[0],\n       block_shape[0],\n      ...,\n      padded_shape[M] / block_shape[M-1],\n      block_shape[M-1]] +\n     remaining_shape\n\n3. Permute dimensions of `reshaped_padded` to produce\n   `permuted_reshaped_padded` of shape:\n     block_shape +\n     [batch] +\n     [padded_shape[1] / block_shape[0],\n      ...,\n      padded_shape[M] / block_shape[M-1]] +\n     remaining_shape\n\n4. 
Reshape `permuted_reshaped_padded` to flatten `block_shape` into the batch\n   dimension, producing an output tensor of shape:\n     [batch * prod(block_shape)] +\n     [padded_shape[1] / block_shape[0],\n      ...,\n      padded_shape[M] / block_shape[M-1]] +\n     remaining_shape\n\nSome examples:\n\n(1) For the following input of shape `[1, 2, 2, 1]`, `block_shape = [2, 2]`, and\n    `paddings = [[0, 0], [0, 0]]`:\n\n```prettyprint\nx = [[[[1], [2]], [[3], [4]]]]\n```\n\nThe output tensor has shape `[4, 1, 1, 1]` and value:\n\n```prettyprint\n[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]\n```\n\n(2) For the following input of shape `[1, 2, 2, 3]`, `block_shape = [2, 2]`, and\n    `paddings = [[0, 0], [0, 0]]`:\n\n```prettyprint\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\nThe output tensor has shape `[4, 1, 1, 3]` and value:\n\n```prettyprint\n[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]\n```\n\n(3) For the following input of shape `[1, 4, 4, 1]`, `block_shape = [2, 2]`, and\n    `paddings = [[0, 0], [0, 0]]`:\n\n```prettyprint\nx = [[[[1],   [2],  [3],  [4]],\n      [[5],   [6],  [7],  [8]],\n      [[9],  [10], [11],  [12]],\n      [[13], [14], [15],  [16]]]]\n```\n\nThe output tensor has shape `[4, 2, 2, 1]` and value:\n\n```prettyprint\nx = [[[[1], [3]], [[9], [11]]],\n     [[[2], [4]], [[10], [12]]],\n     [[[5], [7]], [[13], [15]]],\n     [[[6], [8]], [[14], [16]]]]\n```\n\n(4) For the following input of shape `[2, 2, 4, 1]`, block_shape = `[2, 2]`, and\n    paddings = `[[0, 0], [2, 0]]`:\n\n```prettyprint\nx = [[[[1],   [2],  [3],  [4]],\n      [[5],   [6],  [7],  [8]]],\n     [[[9],  [10], [11],  [12]],\n      [[13], [14], [15],  [16]]]]\n```\n\nThe output tensor has shape `[8, 1, 3, 1]` and value:\n\n```prettyprint\nx = [[[[0], [1], [3]]], [[[0], [9], [11]]],\n     [[[0], [2], [4]]], [[[0], [10], [12]]],\n     [[[0], [5], [7]]], [[[0], [13], [15]]],\n     [[[0], [6], [8]]], [[[0], [14], [16]]]]\n```\n\nAmong others, this 
operation is useful for reducing atrous convolution into\nregular convolution."
+    type_attr: "Tpaddings"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tblock_shape"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  summary: "SpaceToBatch for N-D tensors of type T."
+  description: "This operation divides \"spatial\" dimensions `[1, ..., M]` of the input into a\ngrid of blocks of shape `block_shape`, and interleaves these blocks with the\n\"batch\" dimension (0) such that in the output, the spatial dimensions\n`[1, ..., M]` correspond to the position within the grid, and the batch\ndimension combines both the position within a spatial block and the original\nbatch position.  Prior to division into blocks, the spatial dimensions of the\ninput are optionally zero padded according to `paddings`.  See below for a\nprecise description."
 }
 op {
   name: "SpaceToDepth"
@@ -14961,8 +15069,7 @@ op {
   attr {
     name: "concat_dim"
     type: "int"
-    description: "Dimension to concatenate along."
-    has_minimum: true
+    description: "Dimension to concatenate along. Must be in range [-rank, rank),\nwhere rank is the number of dimensions in each input `SparseTensor`."
   }
   attr {
     name: "N"
diff --git a/tensorflow/core/ops/sparse_ops.cc b/tensorflow/core/ops/sparse_ops.cc
index 14fcda9bd5f..a1d6b648e7a 100644
--- a/tensorflow/core/ops/sparse_ops.cc
+++ b/tensorflow/core/ops/sparse_ops.cc
@@ -358,7 +358,7 @@ REGISTER_OP("SparseConcat")
     .Output("output_indices: int64")
     .Output("output_values: T")
     .Output("output_shape: int64")
-    .Attr("concat_dim: int >= 0")
+    .Attr("concat_dim: int")
     .Attr("N: int >= 2")
     .Attr("T: type")
     .SetShapeFn([](InferenceContext* c) {
@@ -446,7 +446,8 @@ shapes: 1-D.  Shapes of each `SparseTensor`.
 output_indices: 2-D.  Indices of the concatenated `SparseTensor`.
 output_values: 1-D.  Non-empty values of the concatenated `SparseTensor`.
 output_shape: 1-D.  Shape of the concatenated `SparseTensor`.
-concat_dim: Dimension to concatenate along.
+concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
+    where rank is the number of dimensions in each input `SparseTensor`.
 )doc");
 
 REGISTER_OP("SparseSplit")
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index 22591bb1ecc..e40310d7e8e 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -33,7 +33,6 @@ limitations under the License.
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/protobuf.h"
-#include "tensorflow/core/platform/regexp.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 
 namespace tensorflow {
@@ -75,18 +74,14 @@ Status ParseGcsPath(StringPiece fname, string* bucket, string* object) {
   if (!bucket || !object) {
     return errors::Internal("bucket and object cannot be null.");
   }
-  if (!fname.Consume("gs://")) {
+  StringPiece scheme, bucketp, objectp;
+  ParseURI(fname, &scheme, &bucketp, &objectp);
+  if (scheme != "gs") {
     return errors::InvalidArgument("GCS path must start with gs://");
   }
-  auto first_slash = fname.find('/');
-  if (first_slash == string::npos) {
-    *bucket = fname.ToString();
-    *object = string();
-  } else {
-    *bucket = fname.substr(0, first_slash).ToString();
-    fname.remove_prefix(first_slash + 1);
-    *object = fname.ToString();
-  }
+  *bucket = bucketp.ToString();
+  objectp.Consume("/");
+  *object = objectp.ToString();
   return Status::OK();
 }
 
@@ -101,6 +96,71 @@ string MaybeAppendSlash(const string& name) {
   return name;
 }
 
+Status ParseJson(StringPiece json, Json::Value* result) {
+  Json::Reader reader;
+  if (!reader.parse(json.ToString(), *result)) {
+    return errors::Internal("Couldn't parse JSON response from GCS.");
+  }
+  return Status::OK();
+}
+
+/// Reads a JSON value with the given name from a parent JSON value.
+Status GetValue(const Json::Value& parent, const string& name,
+                Json::Value* result) {
+  *result = parent.get(name, Json::Value::null);
+  if (*result == Json::Value::null) {
+    return errors::Internal(strings::StrCat(
+        "The field '", name, "' was expected in the JSON response."));
+  }
+  return Status::OK();
+}
+
+/// Reads a string JSON value with the given name from a parent JSON value.
+Status GetStringValue(const Json::Value& parent, const string& name,
+                      string* result) {
+  Json::Value result_value;
+  TF_RETURN_IF_ERROR(GetValue(parent, name, &result_value));
+  if (!result_value.isString()) {
+    return errors::Internal(
+        strings::StrCat("The field '", name,
+                        "' in the JSON response was expected to be a string."));
+  }
+  *result = result_value.asString();
+  return Status::OK();
+}
+
+/// Reads a long JSON value with the given name from a parent JSON value.
+Status GetInt64Value(const Json::Value& parent, const string& name,
+                     int64* result) {
+  Json::Value result_value;
+  TF_RETURN_IF_ERROR(GetValue(parent, name, &result_value));
+  if (result_value.isNumeric()) {
+    *result = result_value.asInt64();
+    return Status::OK();
+  }
+  if (result_value.isString() &&
+      strings::safe_strto64(result_value.asString().c_str(), result)) {
+    return Status::OK();
+  }
+  return errors::Internal(
+      strings::StrCat("The field '", name,
+                      "' in the JSON response was expected to be a number."));
+}
+
+/// Reads a boolean JSON value with the given name from a parent JSON value.
+Status GetBoolValue(const Json::Value& parent, const string& name,
+                    bool* result) {
+  Json::Value result_value;
+  TF_RETURN_IF_ERROR(GetValue(parent, name, &result_value));
+  if (!result_value.isBool()) {
+    return errors::Internal(strings::StrCat(
+        "The field '", name,
+        "' in the JSON response was expected to be a boolean."));
+  }
+  *result = result_value.asBool();
+  return Status::OK();
+}
+
 /// A GCS-based implementation of a random access file with a read-ahead buffer.
 class GcsRandomAccessFile : public RandomAccessFile {
  public:
@@ -609,40 +669,17 @@ Status GcsFileSystem::StatForObject(const string& bucket, const string& object,
       request->SetResultBuffer(scratch.get(), kBufferSize, &response_piece));
   TF_RETURN_WITH_CONTEXT_IF_ERROR(
       request->Send(), " when reading metadata of gs://", bucket, "/", object);
-  std::stringstream response_stream;
-  response_stream << response_piece;
 
   Json::Value root;
-  Json::Reader reader;
-  if (!reader.parse(response_stream.str(), root)) {
-    return errors::Internal("Couldn't parse JSON response from GCS.");
-  }
+  TF_RETURN_IF_ERROR(ParseJson(response_piece, &root));
 
   // Parse file size.
-  const auto size = root.get("size", Json::Value::null);
-  if (size == Json::Value::null) {
-    return errors::Internal("'size' was expected in the JSON response.");
-  }
-  if (size.isNumeric()) {
-    stat->length = size.asUInt64();
-  } else if (size.isString()) {
-    if (!strings::safe_strto64(size.asString().c_str(), &(stat->length))) {
-      return errors::Internal("'size' couldn't be parsed as a nubmer.");
-    }
-  } else {
-    return errors::Internal("'size' is not a number in the JSON response.");
-  }
+  TF_RETURN_IF_ERROR(GetInt64Value(root, "size", &(stat->length)));
 
   // Parse file modification time.
-  const auto updated = root.get("updated", Json::Value::null);
-  if (updated == Json::Value::null) {
-    return errors::Internal("'updated' was expected in the JSON response.");
-  }
-  if (!updated.isString()) {
-    return errors::Internal(
-        "'updated' is expected to be a string in the JSON response.");
-  }
-  TF_RETURN_IF_ERROR(ParseRfc3339Time(updated.asString(), &(stat->mtime_nsec)));
+  string updated;
+  TF_RETURN_IF_ERROR(GetStringValue(root, "updated", &updated));
+  TF_RETURN_IF_ERROR(ParseRfc3339Time(updated, &(stat->mtime_nsec)));
 
   stat->is_directory = false;
 
@@ -714,13 +751,8 @@ Status GcsFileSystem::GetChildrenBounded(const string& dirname,
     TF_RETURN_IF_ERROR(
         request->SetResultBuffer(scratch.get(), kBufferSize, &response_piece));
     TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), " when reading ", dirname);
-    std::stringstream response_stream;
-    response_stream << response_piece;
     Json::Value root;
-    Json::Reader reader;
-    if (!reader.parse(response_stream.str(), root)) {
-      return errors::Internal("Couldn't parse JSON response from GCS.");
-    }
+    TF_RETURN_IF_ERROR(ParseJson(response_piece, &root));
     const auto items = root.get("items", Json::Value::null);
     if (items == Json::Value::null) {
       // Empty results.
@@ -735,20 +767,16 @@ Status GcsFileSystem::GetChildrenBounded(const string& dirname,
         return errors::Internal(
             "Unexpected JSON format: 'items' should be a list of objects.");
       }
-      const auto name = item.get("name", Json::Value::null);
-      if (name == Json::Value::null || !name.isString()) {
-        return errors::Internal(
-            "Unexpected JSON format: 'items.name' is missing or not a string.");
-      }
+      string name;
+      TF_RETURN_IF_ERROR(GetStringValue(item, "name", &name));
       // The names should be relative to the 'dirname'. That means the
       // 'object_prefix', which is part of 'dirname', should be removed from the
       // beginning of 'name'.
-      const string& name_str = name.asString();
-      StringPiece relative_path(name_str);
+      StringPiece relative_path(name);
       if (!relative_path.Consume(object_prefix)) {
-        return errors::Internal(strings::StrCat(
-            "Unexpected response: the returned file name ", name_str,
-            " doesn't match the prefix ", object_prefix));
+        return errors::Internal(
+            strings::StrCat("Unexpected response: the returned file name ",
+                            name, " doesn't match the prefix ", object_prefix));
       }
       result->emplace_back(relative_path.ToString());
       if (++retrieved_results >= max_results) {
@@ -886,9 +914,28 @@ Status GcsFileSystem::RenameObject(const string& src, const string& target) {
       request->EscapeString(target_object))));
   TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
   TF_RETURN_IF_ERROR(request->SetPostEmptyBody());
+  std::unique_ptr<char[]> scratch(new char[kBufferSize]);
+  StringPiece response_piece;
+  TF_RETURN_IF_ERROR(
+      request->SetResultBuffer(scratch.get(), kBufferSize, &response_piece));
   TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), " when renaming ", src,
                                   " to ", target);
 
+  Json::Value root;
+  TF_RETURN_IF_ERROR(ParseJson(response_piece, &root));
+  bool done;
+  TF_RETURN_IF_ERROR(GetBoolValue(root, "done", &done));
+  if (!done) {
+    // If GCS didn't complete rewrite in one call, this means that a large file
+    // is being copied to a bucket with a different storage class or location,
+    // which requires multiple rewrite calls.
+    // TODO(surkov): implement multi-step rewrites.
+    return errors::Unimplemented(
+        strings::StrCat("Couldn't rename ", src, " to ", target,
+                        ": moving large files between buckets with different "
+                        "locations or storage classes is not supported."));
+  }
+
   TF_RETURN_IF_ERROR(DeleteFile(src));
   return Status::OK();
 }
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index 826bb1bfba9..46e0a432f53 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -756,7 +756,7 @@ TEST(GcsFileSystemTest, RenameFile_Folder) {
            "path1%2F/rewriteTo/b/bucket/o/path2%2F\n"
            "Auth Token: fake_token\n"
            "Post: yes\n",
-           ""),
+           "{\"done\": true}"),
        // Deleting the original directory marker.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
@@ -771,7 +771,7 @@ TEST(GcsFileSystemTest, RenameFile_Folder) {
            "path2%2Fsubfolder%2Ffile1.txt\n"
            "Auth Token: fake_token\n"
            "Post: yes\n",
-           ""),
+           "{\"done\": true}"),
        // Deleting the first original file.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
@@ -785,7 +785,7 @@ TEST(GcsFileSystemTest, RenameFile_Folder) {
            "path1%2Ffile2.txt/rewriteTo/b/bucket/o/path2%2Ffile2.txt\n"
            "Auth Token: fake_token\n"
            "Post: yes\n",
-           ""),
+           "{\"done\": true}"),
        // Deleting the second original file.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
@@ -823,7 +823,7 @@ TEST(GcsFileSystemTest, RenameFile_Object) {
            "path%2Fsrc.txt/rewriteTo/b/bucket/o/path%2Fdst.txt\n"
            "Auth Token: fake_token\n"
            "Post: yes\n",
-           ""),
+           "{\"done\": true}"),
        // Deleting the original file.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
@@ -840,6 +840,41 @@ TEST(GcsFileSystemTest, RenameFile_Object) {
       fs.RenameFile("gs://bucket/path/src.txt", "gs://bucket/path/dst.txt"));
 }
 
+/// Tests the case when rewrite couldn't complete in one RPC.
+TEST(GcsFileSystemTest, RenameFile_Object_Incomplete) {
+  std::vector<HttpRequest*> requests(
+      {// IsDirectory is checking whether there are children objects.
+       new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
+           "fields=items%2Fname%2CnextPageToken&prefix=path%2Fsrc.txt%2F"
+           "&maxResults=1\n"
+           "Auth Token: fake_token\n",
+           "{}"),
+       // IsDirectory is checking if the path exists as an object.
+       new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "path%2Fsrc.txt?fields=size%2Cupdated\n"
+           "Auth Token: fake_token\n",
+           strings::StrCat("{\"size\": \"1010\","
+                           "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
+       // Copying to the new location.
+       new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "path%2Fsrc.txt/rewriteTo/b/bucket/o/path%2Fdst.txt\n"
+           "Auth Token: fake_token\n"
+           "Post: yes\n",
+           "{\"done\": false}")});
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   0 /* read ahead bytes */, 5 /* max upload attempts */);
+
+  EXPECT_EQ(
+      errors::Code::UNIMPLEMENTED,
+      fs.RenameFile("gs://bucket/path/src.txt", "gs://bucket/path/dst.txt")
+          .code());
+}
+
 TEST(GcsFileSystemTest, Stat_Object) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
diff --git a/tensorflow/core/platform/cloud/http_request.cc b/tensorflow/core/platform/cloud/http_request.cc
index 4de066b482c..211a340941f 100644
--- a/tensorflow/core/platform/cloud/http_request.cc
+++ b/tensorflow/core/platform/cloud/http_request.cc
@@ -215,6 +215,8 @@ Status HttpRequest::Init() {
   libcurl_->curl_easy_setopt(curl_, CURLOPT_TIMEOUT, kRequestTimeoutSeconds);
   libcurl_->curl_easy_setopt(curl_, CURLOPT_CONNECTTIMEOUT,
                              kConnectTimeoutSeconds);
+  libcurl_->curl_easy_setopt(curl_, CURLOPT_HTTP_VERSION,
+                             CURL_HTTP_VERSION_2_0);
 
   // If response buffer is not set, libcurl will print results to stdout,
   // so we always set it.
@@ -448,7 +450,10 @@ Status HttpRequest::Send() {
     case 204:  // No Content
     case 206:  // Partial Content
       if (curl_result != CURLE_OK) {
-        return errors::Internal(string("curl error: ") + error_buffer);
+        // UNAVAILABLE can be retried by the caller, e.g by RetryingFileSystem.
+        return errors::Unavailable(
+            strings::StrCat("libcurl failed with error code ", curl_result,
+                            ": ", error_buffer));
       }
       if (response_buffer_ && response_string_piece_) {
         *response_string_piece_ = StringPiece(response_buffer_, written_size);
@@ -465,6 +470,7 @@ Status HttpRequest::Send() {
       }
       return Status::OK();
     default:
+      // UNAVAILABLE can be retried by the caller, e.g by RetryingFileSystem.
       return errors::Unavailable(
           strings::StrCat("Unexpected response code ", response_code_));
   }
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index e1983a95437..661f77b3c4e 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -102,12 +102,17 @@ def tf_additional_lib_srcs():
       "platform/posix/*.cc",
   ]
 
+def tf_additional_proto_hdrs():
+  return [
+      "platform/default/integral_types.h",
+      "platform/default/logging.h",
+      "platform/default/protobuf.h"
+  ]
+
 def tf_additional_proto_srcs():
-  return ["platform/default/integral_types.h",
-          "platform/default/logging.h",
-          "platform/default/logging.cc",
-          "platform/default/protobuf.h",
-          "platform/default/protobuf.cc",
+  return [
+      "platform/default/logging.cc",
+      "platform/default/protobuf.cc",
   ]
 
 def tf_additional_stream_executor_srcs():
diff --git a/tensorflow/core/platform/default/fingerprint.h b/tensorflow/core/platform/default/fingerprint.h
index 087b1cdc254..71f9951e53e 100644
--- a/tensorflow/core/platform/default/fingerprint.h
+++ b/tensorflow/core/platform/default/fingerprint.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_PLATFORM_DEFAULT_FINGERPRINT_H_
 #define TENSORFLOW_CORE_PLATFORM_DEFAULT_FINGERPRINT_H_
 
-#include "farmhash-34c13ddfab0e35422f4c3979f360635a8c050260/src/farmhash.h"
+#include <farmhash.h>
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/platform/default/logging.h b/tensorflow/core/platform/default/logging.h
index dc4e6eb3fe7..1333bd00fc5 100644
--- a/tensorflow/core/platform/default/logging.h
+++ b/tensorflow/core/platform/default/logging.h
@@ -52,7 +52,7 @@ class LogMessage : public std::basic_ostringstream<char> {
 class LogMessageFatal : public LogMessage {
  public:
   LogMessageFatal(const char* file, int line) TF_ATTRIBUTE_COLD;
-  ~LogMessageFatal() TF_ATTRIBUTE_NORETURN;
+  TF_ATTRIBUTE_NORETURN ~LogMessageFatal();
 };
 
 #define _TF_LOG_INFO \
diff --git a/tensorflow/core/platform/default/test_benchmark.cc b/tensorflow/core/platform/default/test_benchmark.cc
index 9174e9ebb92..dedab42bd73 100644
--- a/tensorflow/core/platform/default/test_benchmark.cc
+++ b/tensorflow/core/platform/default/test_benchmark.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/regexp.h"
 #include "tensorflow/core/util/reporter.h"
 
 namespace tensorflow {
@@ -114,6 +113,10 @@ Benchmark* Benchmark::RangePair(int lo1, int hi1, int lo2, int hi2) {
 void Benchmark::Run(const char* pattern) {
   if (!all_benchmarks) return;
 
+  // Converts "all" into the wildcard '.*'.  Currently pattern isn't
+  // specified by clients, but we keep this here to match the internal
+  // Google implementation, should we ever enable user-specified
+  // pattern specification.
   if (StringPiece(pattern) == "all") {
     pattern = ".*";
   }
@@ -131,9 +134,11 @@ void Benchmark::Run(const char* pattern) {
           strings::StrAppend(&name, "/", arg.second);
         }
       }
-      if (RE2::PartialMatch(name, pattern)) {
-        width = std::max<int>(width, name.size());
-      }
+
+      // TODO(vrv): Check against 'pattern' using a regex before
+      // computing the width, if we start allowing clients to pass in
+      // a custom pattern.
+      width = std::max<int>(width, name.size());
     }
   }
 
@@ -149,9 +154,10 @@ void Benchmark::Run(const char* pattern) {
           strings::StrAppend(&name, "/", arg.second);
         }
       }
-      if (!RE2::PartialMatch(name, pattern)) {
-        continue;
-      }
+
+      // TODO(vrv): Match 'name' against 'pattern' using a regex
+      // before continuing, if we start allowing clients to pass in a
+      // custom pattern.
 
       int iters;
       double seconds;
diff --git a/tensorflow/core/platform/demangle.h b/tensorflow/core/platform/demangle.h
index ee4c97757a8..c2def217a12 100644
--- a/tensorflow/core/platform/demangle.h
+++ b/tensorflow/core/platform/demangle.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_DEMANGLE_H_
 #define THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_DEMANGLE_H_
 
+#include "tensorflow/core/platform/types.h"
+
 namespace tensorflow {
 namespace port {
 
diff --git a/tensorflow/core/platform/env.cc b/tensorflow/core/platform/env.cc
index 451e8a8d68f..4971df926ec 100644
--- a/tensorflow/core/platform/env.cc
+++ b/tensorflow/core/platform/env.cc
@@ -69,8 +69,9 @@ Status FileSystemRegistryImpl::GetRegisteredFileSystemSchemes(
 Env::Env() : file_system_registry_(new FileSystemRegistryImpl) {}
 
 Status Env::GetFileSystemForFile(const string& fname, FileSystem** result) {
-  string scheme = GetSchemeFromURI(fname);
-  FileSystem* file_system = file_system_registry_->Lookup(scheme);
+  StringPiece scheme, host, path;
+  ParseURI(fname, &scheme, &host, &path);
+  FileSystem* file_system = file_system_registry_->Lookup(scheme.ToString());
   if (!file_system) {
     return errors::Unimplemented("File system scheme ", scheme,
                                  " not implemented");
@@ -130,6 +131,47 @@ Status Env::GetChildren(const string& dir, std::vector<string>* result) {
   return fs->GetChildren(dir, result);
 }
 
+Status Env::GetMatchingPaths(const string& pattern,
+                             std::vector<string>* results) {
+  FileSystem* fs;
+  TF_RETURN_IF_ERROR(GetFileSystemForFile(pattern, &fs));
+  results->clear();
+  // Find the fixed prefix by looking for the first wildcard.
+  const string& fixed_prefix =
+      pattern.substr(0, pattern.find_first_of("*?[\\"));
+  std::vector<string> all_files;
+  string dir = io::Dirname(fixed_prefix).ToString();
+  if (dir.empty()) dir = ".";
+
+  // Setup a BFS to explore everything under dir.
+  std::deque<string> dir_q;
+  dir_q.push_back(dir);
+  Status ret;  // Status to return.
+  while (!dir_q.empty()) {
+    string current_dir = dir_q.front();
+    dir_q.pop_front();
+    std::vector<string> children;
+    Status s = fs->GetChildren(current_dir, &children);
+    ret.Update(s);
+    for (const string& child : children) {
+      const string child_path = io::JoinPath(current_dir, child);
+      // If the child is a directory add it to the queue.
+      if (fs->IsDirectory(child_path).ok()) {
+        dir_q.push_back(child_path);
+      }
+      all_files.push_back(child_path);
+    }
+  }
+
+  // Match all obtained files to the input pattern.
+  for (const auto& f : all_files) {
+    if (MatchPath(f, pattern)) {
+      results->push_back(f);
+    }
+  }
+  return ret;
+}
+
 Status Env::DeleteFile(const string& fname) {
   FileSystem* fs;
   TF_RETURN_IF_ERROR(GetFileSystemForFile(fname, &fs));
diff --git a/tensorflow/core/platform/env.h b/tensorflow/core/platform/env.h
index 6bb7ac08ee4..d4a96c73086 100644
--- a/tensorflow/core/platform/env.h
+++ b/tensorflow/core/platform/env.h
@@ -141,6 +141,37 @@ class Env {
   /// Original contents of *results are dropped.
   Status GetChildren(const string& dir, std::vector<string>* result);
 
+  /// \brief Returns true if the path matches the given pattern. The wildcards
+  /// allowed in pattern are described below (GetMatchingPaths).
+  virtual bool MatchPath(const string& path, const string& pattern) = 0;
+
+  /// \brief Given a pattern, stores in *results the set of paths that matches
+  /// that pattern. *results is cleared.
+  ///
+  /// pattern must match all of a name, not just a substring.
+  ///
+  /// pattern: { term }
+  /// term:
+  ///   '*': matches any sequence of non-'/' characters
+  ///   '?': matches a single non-'/' character
+  ///   '[' [ '^' ] { match-list } ']':
+  ///        matches any single character (not) on the list
+  ///   c: matches character c (c != '*', '?', '\\', '[')
+  ///   '\\' c: matches character c
+  /// character-range:
+  ///   c: matches character c (c != '\\', '-', ']')
+  ///   '\\' c: matches character c
+  ///   lo '-' hi: matches character c for lo <= c <= hi
+  ///
+  /// Typical return codes
+  ///  * OK - no errors
+  ///  * UNIMPLEMENTED - Some underlying functions (like GetChildren) are not
+  ///                    implemented
+  /// The default implementation uses a combination of GetChildren, MatchPath
+  /// and IsDirectory.
+  virtual Status GetMatchingPaths(const string& pattern,
+                                  std::vector<string>* results);
+
   /// Deletes the named file.
   Status DeleteFile(const string& fname);
 
@@ -251,11 +282,8 @@ class Env {
                                       void** symbol) = 0;
 
  private:
-  /// No copying allowed
-  Env(const Env&);
-  void operator=(const Env&);
-
   std::unique_ptr<FileSystemRegistry> file_system_registry_;
+  TF_DISALLOW_COPY_AND_ASSIGN(Env);
 };
 
 /// \brief An implementation of Env that forwards all calls to another Env.
@@ -285,6 +313,10 @@ class EnvWrapper : public Env {
     return target_->RegisterFileSystem(scheme, factory);
   }
 
+  bool MatchPath(const string& path, const string& pattern) override {
+    return target_->MatchPath(path, pattern);
+  }
+
   uint64 NowMicros() override { return target_->NowMicros(); }
   void SleepForMicroseconds(int64 micros) override {
     target_->SleepForMicroseconds(micros);
@@ -319,9 +351,7 @@ class Thread {
   virtual ~Thread();
 
  private:
-  /// No copying allowed
-  Thread(const Thread&);
-  void operator=(const Thread&);
+  TF_DISALLOW_COPY_AND_ASSIGN(Thread);
 };
 
 /// \brief Options to configure a Thread.
diff --git a/tensorflow/core/platform/env_test.cc b/tensorflow/core/platform/env_test.cc
index 4fe1d7d8f77..6883cd54f3f 100644
--- a/tensorflow/core/platform/env_test.cc
+++ b/tensorflow/core/platform/env_test.cc
@@ -19,13 +19,12 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
 
-struct EnvTest {};
-
 namespace {
 string CreateTestFile(Env* env, const string& filename, int length) {
   string input(length, 0);
@@ -35,208 +34,193 @@ string CreateTestFile(Env* env, const string& filename, int length) {
 }
 }  // namespace
 
-TEST(EnvTest, ReadFileToString) {
-  Env* env = Env::Default();
-  const string dir = testing::TmpDir();
+class DefaultEnvTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    base_dir_ = io::JoinPath(testing::TmpDir(), "base_dir");
+    env_->CreateDir(base_dir_);
+  }
+
+  void TearDown() override {
+    int64 undeleted_files, undeleted_dirs;
+    env_->DeleteRecursively(base_dir_, &undeleted_files, &undeleted_dirs);
+  }
+
+  string base_dir_;
+  Env* env_ = Env::Default();
+};
+
+TEST_F(DefaultEnvTest, ReadFileToString) {
   for (const int length : {0, 1, 1212, 2553, 4928, 8196, 9000, (1 << 20) - 1,
                            1 << 20, (1 << 20) + 1}) {
-    const string filename = strings::StrCat(dir, "/bar/..//file", length);
+    const string filename = strings::StrCat(base_dir_, "/bar/..//file", length);
 
     // Write a file with the given length
-    const string input = CreateTestFile(env, filename, length);
+    const string input = CreateTestFile(env_, filename, length);
 
     // Read the file back and check equality
     string output;
-    TF_EXPECT_OK(ReadFileToString(env, filename, &output));
+    TF_EXPECT_OK(ReadFileToString(env_, filename, &output));
     EXPECT_EQ(length, output.size());
     EXPECT_EQ(input, output);
 
     // Obtain stats.
     FileStatistics stat;
-    TF_EXPECT_OK(env->Stat(filename, &stat));
+    TF_EXPECT_OK(env_->Stat(filename, &stat));
     EXPECT_EQ(length, stat.length);
     EXPECT_FALSE(stat.is_directory);
   }
 }
 
-TEST(EnvTest, FileToReadonlyMemoryRegion) {
-  Env* env = Env::Default();
-  const string dir = testing::TmpDir();
+TEST_F(DefaultEnvTest, FileToReadonlyMemoryRegion) {
   for (const int length : {1, 1212, 2553, 4928, 8196, 9000, (1 << 20) - 1,
                            1 << 20, (1 << 20) + 1}) {
-    const string filename = io::JoinPath(dir, strings::StrCat("file", length));
+    const string filename =
+        io::JoinPath(base_dir_, strings::StrCat("file", length));
 
     // Write a file with the given length
-    const string input = CreateTestFile(env, filename, length);
+    const string input = CreateTestFile(env_, filename, length);
 
     // Create the region.
     std::unique_ptr<ReadOnlyMemoryRegion> region;
-    TF_EXPECT_OK(env->NewReadOnlyMemoryRegionFromFile(filename, &region));
+    TF_EXPECT_OK(env_->NewReadOnlyMemoryRegionFromFile(filename, &region));
     ASSERT_NE(region, nullptr);
     EXPECT_EQ(length, region->length());
     EXPECT_EQ(input, string(reinterpret_cast<const char*>(region->data()),
                             region->length()));
     FileStatistics stat;
-    TF_EXPECT_OK(env->Stat(filename, &stat));
+    TF_EXPECT_OK(env_->Stat(filename, &stat));
     EXPECT_EQ(length, stat.length);
     EXPECT_FALSE(stat.is_directory);
   }
 }
 
-TEST(EnvTest, DeleteRecursively) {
-  Env* env = Env::Default();
+TEST_F(DefaultEnvTest, DeleteRecursively) {
   // Build a directory structure rooted at root_dir.
   // root_dir -> dirs: child_dir1, child_dir2; files: root_file1, root_file2
   // child_dir1 -> files: child1_file1
   // child_dir2 -> empty
-  const string parent_dir = io::JoinPath(testing::TmpDir(), "root_dir");
+  const string parent_dir = io::JoinPath(base_dir_, "root_dir");
   const string child_dir1 = io::JoinPath(parent_dir, "child_dir1");
   const string child_dir2 = io::JoinPath(parent_dir, "child_dir2");
-  TF_EXPECT_OK(env->CreateDir(parent_dir));
+  TF_EXPECT_OK(env_->CreateDir(parent_dir));
   const string root_file1 = io::JoinPath(parent_dir, "root_file1");
   const string root_file2 = io::JoinPath(parent_dir, "root_file2");
   const string root_file3 = io::JoinPath(parent_dir, ".root_file3");
-  CreateTestFile(env, root_file1, 100);
-  CreateTestFile(env, root_file2, 100);
-  CreateTestFile(env, root_file3, 100);
-  TF_EXPECT_OK(env->CreateDir(child_dir1));
+  CreateTestFile(env_, root_file1, 100);
+  CreateTestFile(env_, root_file2, 100);
+  CreateTestFile(env_, root_file3, 100);
+  TF_EXPECT_OK(env_->CreateDir(child_dir1));
   const string child1_file1 = io::JoinPath(child_dir1, "child1_file1");
-  CreateTestFile(env, child1_file1, 100);
-  TF_EXPECT_OK(env->CreateDir(child_dir2));
+  CreateTestFile(env_, child1_file1, 100);
+  TF_EXPECT_OK(env_->CreateDir(child_dir2));
 
   int64 undeleted_files, undeleted_dirs;
   TF_EXPECT_OK(
-      env->DeleteRecursively(parent_dir, &undeleted_files, &undeleted_dirs));
+      env_->DeleteRecursively(parent_dir, &undeleted_files, &undeleted_dirs));
   EXPECT_EQ(0, undeleted_files);
   EXPECT_EQ(0, undeleted_dirs);
-  EXPECT_FALSE(env->FileExists(root_file1));
-  EXPECT_FALSE(env->FileExists(root_file2));
-  EXPECT_FALSE(env->FileExists(root_file3));
-  EXPECT_FALSE(env->FileExists(child1_file1));
+  EXPECT_FALSE(env_->FileExists(root_file1));
+  EXPECT_FALSE(env_->FileExists(root_file2));
+  EXPECT_FALSE(env_->FileExists(root_file3));
+  EXPECT_FALSE(env_->FileExists(child1_file1));
 }
 
-TEST(EnvTest, DeleteRecursivelyFail) {
+TEST_F(DefaultEnvTest, DeleteRecursivelyFail) {
   // Try to delete a non-existent directory.
-  Env* env = Env::Default();
-  const string parent_dir = io::JoinPath(testing::TmpDir(), "root_dir");
+  const string parent_dir = io::JoinPath(base_dir_, "root_dir");
 
   int64 undeleted_files, undeleted_dirs;
   Status s =
-      env->DeleteRecursively(parent_dir, &undeleted_files, &undeleted_dirs);
+      env_->DeleteRecursively(parent_dir, &undeleted_files, &undeleted_dirs);
   EXPECT_EQ("Not found: Directory doesn't exist", s.ToString());
   EXPECT_EQ(0, undeleted_files);
   EXPECT_EQ(1, undeleted_dirs);
 }
 
-TEST(EnvTest, RecursivelyCreateDir) {
-  Env* env = Env::Default();
-  const string create_path = io::JoinPath(testing::TmpDir(), "a//b/c/d");
-  TF_CHECK_OK(env->RecursivelyCreateDir(create_path));
-  TF_CHECK_OK(env->RecursivelyCreateDir(create_path));  // repeat creation.
-  EXPECT_TRUE(env->FileExists(create_path));
-
-  // Clean up.
-  // TODO(rohanj): Do this more elegantly using SetUp() and TearDown() methods.
-  int64 undeleted_files, undeleted_dirs;
-  TF_CHECK_OK(env->DeleteRecursively(io::JoinPath(testing::TmpDir(), "a"),
-                                     &undeleted_files, &undeleted_dirs));
+TEST_F(DefaultEnvTest, RecursivelyCreateDir) {
+  const string create_path = io::JoinPath(base_dir_, "a//b/c/d");
+  TF_CHECK_OK(env_->RecursivelyCreateDir(create_path));
+  TF_CHECK_OK(env_->RecursivelyCreateDir(create_path));  // repeat creation.
+  EXPECT_TRUE(env_->FileExists(create_path));
 }
 
-TEST(EnvTest, RecursivelyCreateDirEmpty) {
-  Env* env = Env::Default();
-  TF_CHECK_OK(env->RecursivelyCreateDir(""));
+TEST_F(DefaultEnvTest, RecursivelyCreateDirEmpty) {
+  TF_CHECK_OK(env_->RecursivelyCreateDir(""));
 }
 
-TEST(EnvTest, RecursivelyCreateDirSubdirsExist) {
-  Env* env = Env::Default();
+TEST_F(DefaultEnvTest, RecursivelyCreateDirSubdirsExist) {
   // First create a/b.
-  const string subdir_path = io::JoinPath(testing::TmpDir(), "a/b");
-  TF_CHECK_OK(env->CreateDir(io::JoinPath(testing::TmpDir(), "a")));
-  TF_CHECK_OK(env->CreateDir(subdir_path));
-  EXPECT_TRUE(env->FileExists(subdir_path));
+  const string subdir_path = io::JoinPath(base_dir_, "a/b");
+  TF_CHECK_OK(env_->CreateDir(io::JoinPath(base_dir_, "a")));
+  TF_CHECK_OK(env_->CreateDir(subdir_path));
+  EXPECT_TRUE(env_->FileExists(subdir_path));
 
   // Now try to recursively create a/b/c/d/
-  const string create_path = io::JoinPath(testing::TmpDir(), "a/b/c/d/");
-  TF_CHECK_OK(env->RecursivelyCreateDir(create_path));
-  TF_CHECK_OK(env->RecursivelyCreateDir(create_path));  // repeat creation.
-  EXPECT_TRUE(env->FileExists(create_path));
-  EXPECT_TRUE(env->FileExists(io::JoinPath(testing::TmpDir(), "a/b/c")));
-
-  // Clean up.
-  int64 undeleted_files, undeleted_dirs;
-  TF_CHECK_OK(env->DeleteRecursively(io::JoinPath(testing::TmpDir(), "a"),
-                                     &undeleted_files, &undeleted_dirs));
+  const string create_path = io::JoinPath(base_dir_, "a/b/c/d/");
+  TF_CHECK_OK(env_->RecursivelyCreateDir(create_path));
+  TF_CHECK_OK(env_->RecursivelyCreateDir(create_path));  // repeat creation.
+  EXPECT_TRUE(env_->FileExists(create_path));
+  EXPECT_TRUE(env_->FileExists(io::JoinPath(base_dir_, "a/b/c")));
 }
 
-TEST(EnvTest, LocalFileSystem) {
+TEST_F(DefaultEnvTest, LocalFileSystem) {
   // Test filename with file:// syntax.
-  Env* env = Env::Default();
-  const string dir = testing::TmpDir();
   for (const int length : {0, 1, 1212, 2553, 4928, 8196, 9000, (1 << 20) - 1,
                            1 << 20, (1 << 20) + 1}) {
-    string filename = io::JoinPath(dir, strings::StrCat("file", length));
+    string filename = io::JoinPath(base_dir_, strings::StrCat("file", length));
 
     filename = strings::StrCat("file://", filename);
 
     // Write a file with the given length
-    const string input = CreateTestFile(env, filename, length);
+    const string input = CreateTestFile(env_, filename, length);
 
     // Read the file back and check equality
     string output;
-    TF_EXPECT_OK(ReadFileToString(env, filename, &output));
+    TF_EXPECT_OK(ReadFileToString(env_, filename, &output));
     EXPECT_EQ(length, output.size());
     EXPECT_EQ(input, output);
 
     FileStatistics stat;
-    TF_EXPECT_OK(env->Stat(filename, &stat));
+    TF_EXPECT_OK(env_->Stat(filename, &stat));
     EXPECT_EQ(length, stat.length);
     EXPECT_FALSE(stat.is_directory);
   }
 }
 
-class InterPlanetaryFileSystem : public NullFileSystem {
- public:
-  Status GetChildren(const string& dir, std::vector<string>* result) override {
-    std::vector<string> Planets = {"Mercury", "Venus",   "Earth",
-                                   "Mars",    "Jupiter", "Saturn",
-                                   "Uranus",  "Neptune", ".PlanetX"};
-    result->insert(result->end(), Planets.begin(), Planets.end());
-    return Status::OK();
-  }
-};
+#define EXPECT_PARSE_URI(uri, scheme, host, path) \
+  do {                                            \
+    StringPiece s, h, p;                          \
+    ParseURI(uri, &s, &h, &p);                    \
+    EXPECT_EQ(scheme, s.ToString());              \
+    EXPECT_EQ(host, h.ToString());                \
+    EXPECT_EQ(path, p.ToString());                \
+  } while (0)
 
-REGISTER_FILE_SYSTEM("ipfs", InterPlanetaryFileSystem);
-
-TEST(EnvTest, IPFS) {
-  Env* env = Env::Default();
-  std::vector<string> planets;
-  TF_EXPECT_OK(env->GetChildren("ipfs://solarsystem", &planets));
-  int c = 0;
-  std::vector<string> Planets = {"Mercury", "Venus",   "Earth",
-                                 "Mars",    "Jupiter", "Saturn",
-                                 "Uranus",  "Neptune", ".PlanetX"};
-  for (auto p : Planets) {
-    EXPECT_EQ(p, planets[c++]);
-  }
+TEST_F(DefaultEnvTest, ParseURI) {
+  EXPECT_PARSE_URI("http://foo", "http", "foo", "");
+  EXPECT_PARSE_URI("/encrypted/://foo", "", "", "/encrypted/://foo");
+  EXPECT_PARSE_URI("/usr/local/foo", "", "", "/usr/local/foo");
+  EXPECT_PARSE_URI("file:///usr/local/foo", "file", "", "/usr/local/foo");
+  EXPECT_PARSE_URI("local.file:///usr/local/foo", "local.file", "",
+                   "/usr/local/foo");
+  EXPECT_PARSE_URI("a-b:///foo", "", "", "a-b:///foo");
+  EXPECT_PARSE_URI(":///foo", "", "", ":///foo");
+  EXPECT_PARSE_URI("9dfd:///foo", "", "", "9dfd:///foo");
+  EXPECT_PARSE_URI("file:", "", "", "file:");
+  EXPECT_PARSE_URI("file:/", "", "", "file:/");
+  EXPECT_PARSE_URI("hdfs://localhost:8020/path/to/file", "hdfs",
+                   "localhost:8020", "/path/to/file");
+  EXPECT_PARSE_URI("hdfs://localhost:8020", "hdfs", "localhost:8020", "");
+  EXPECT_PARSE_URI("hdfs://localhost:8020/", "hdfs", "localhost:8020", "/");
 }
+#undef EXPECT_PARSE_URI
 
-TEST(EnvTest, GetSchemeForURI) {
-  EXPECT_EQ(GetSchemeFromURI("http://foo"), "http");
-  EXPECT_EQ(GetSchemeFromURI("/encrypted/://foo"), "");
-  EXPECT_EQ(GetSchemeFromURI("/usr/local/foo"), "");
-  EXPECT_EQ(GetSchemeFromURI("file:///usr/local/foo"), "file");
-  EXPECT_EQ(GetSchemeFromURI("local.file:///usr/local/foo"), "local.file");
-  EXPECT_EQ(GetSchemeFromURI("a-b:///foo"), "");
-  EXPECT_EQ(GetSchemeFromURI(":///foo"), "");
-  EXPECT_EQ(GetSchemeFromURI("9dfd:///foo"), "");
-}
-
-TEST(EnvTest, SleepForMicroseconds) {
-  Env* env = Env::Default();
-  const int64 start = env->NowMicros();
+TEST_F(DefaultEnvTest, SleepForMicroseconds) {
+  const int64 start = env_->NowMicros();
   const int64 sleep_time = 1e6 + 5e5;
-  env->SleepForMicroseconds(sleep_time);
-  const int64 delta = env->NowMicros() - start;
+  env_->SleepForMicroseconds(sleep_time);
+  const int64 delta = env_->NowMicros() - start;
 
   // Subtract 10 from the sleep_time for this check because NowMicros can
   // sometimes give slightly inconsistent values between the start and the
@@ -244,4 +228,196 @@ TEST(EnvTest, SleepForMicroseconds) {
   EXPECT_GE(delta, sleep_time - 10);
 }
 
+// Creates a new TestEnv that uses Env::Default for all basic ops but
+// uses the default implementation for the GetMatchingPaths function instead.
+class TestEnv : public EnvWrapper {
+ public:
+  explicit TestEnv(Env* env) : EnvWrapper(env) {}
+
+  ~TestEnv() override = default;
+};
+
+Env* GetTestEnv() {
+  static Env* default_env = new TestEnv(Env::Default());
+  return default_env;
+}
+
+class InterPlanetaryFileSystem : public NullFileSystem {
+ public:
+  Status IsDirectory(const string& dirname) override {
+    if (dirname == "ipfs://solarsystem" ||
+        dirname == "ipfs://solarsystem/Earth" ||
+        dirname == "ipfs://solarsystem/Jupiter") {
+      return Status::OK();
+    }
+    return Status(tensorflow::error::FAILED_PRECONDITION, "Not a directory");
+  }
+
+  Status GetChildren(const string& dir, std::vector<string>* result) override {
+    std::vector<string> celestial_bodies;
+    if (dir == "ipfs://solarsystem") {
+      celestial_bodies = {"Mercury",  "Venus",   "Earth",  "Mars",
+                          "Jupiter",  "Saturn",  "Uranus", "Neptune",
+                          ".PlanetX", "Planet0", "Planet1"};
+
+    } else if (dir == "ipfs://solarsystem/Earth") {
+      celestial_bodies = {"Moon"};
+    } else if (dir == "ipfs://solarsystem/Jupiter") {
+      celestial_bodies = {"Europa", "Io", "Ganymede"};
+    }
+    result->insert(result->end(), celestial_bodies.begin(),
+                   celestial_bodies.end());
+    return Status::OK();
+  }
+};
+
+REGISTER_FILE_SYSTEM_ENV(GetTestEnv(), "ipfs", InterPlanetaryFileSystem);
+
+class TestEnvTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    base_dir_ = io::JoinPath(testing::TmpDir(), "base_dir");
+    env_->CreateDir(base_dir_);
+  }
+
+  void TearDown() override {
+    int64 undeleted_files, undeleted_dirs;
+    env_->DeleteRecursively(base_dir_, &undeleted_files, &undeleted_dirs);
+  }
+
+  // Returns all the matched entries as a comma separated string removing the
+  // common prefix of base_dir_.
+  string Match(const string& base_dir, const string& suffix_pattern) {
+    std::vector<string> results;
+    Status s = env_->GetMatchingPaths(io::JoinPath(base_dir, suffix_pattern),
+                                      &results);
+    if (!s.ok()) {
+      return s.ToString();
+    } else {
+      std::vector<StringPiece> trimmed_results;
+      std::sort(results.begin(), results.end());
+      for (const string& result : results) {
+        StringPiece trimmed_result(result);
+        EXPECT_TRUE(trimmed_result.Consume(base_dir + "/"));
+        trimmed_results.push_back(trimmed_result);
+      }
+      return str_util::Join(trimmed_results, ",");
+    }
+  }
+
+  string base_dir_;
+  Env* env_ = GetTestEnv();
+};
+
+TEST_F(TestEnvTest, IPFS) {
+  std::vector<string> matched_planets;
+  TF_EXPECT_OK(env_->GetChildren("ipfs://solarsystem", &matched_planets));
+  std::vector<string> planets = {"Mercury",  "Venus",   "Earth",  "Mars",
+                                 "Jupiter",  "Saturn",  "Uranus", "Neptune",
+                                 ".PlanetX", "Planet0", "Planet1"};
+  int c = 0;
+  for (auto p : matched_planets) {
+    EXPECT_EQ(p, planets[c++]);
+  }
+}
+
+TEST_F(TestEnvTest, IPFSMatch) {
+  // Make sure we only get the 11 planets and not all their children.
+  EXPECT_EQ(Match("ipfs://solarsystem", "*"),
+            ".PlanetX,Earth,Jupiter,Mars,Mercury,Neptune,Planet0,Planet1,"
+            "Saturn,Uranus,Venus");
+  // Returns Jupiter's moons.
+  EXPECT_EQ(Match("ipfs://solarsystem", "Jupiter/*"),
+            "Jupiter/Europa,Jupiter/Ganymede,Jupiter/Io");
+  // Returns Jupiter's and Earth's moons.
+  EXPECT_EQ(Match("ipfs://solarsystem", "*/*"),
+            "Earth/Moon,Jupiter/Europa,Jupiter/Ganymede,Jupiter/Io");
+  EXPECT_EQ(Match("ipfs://solarsystem", "Planet[0-1]"), "Planet0,Planet1");
+}
+
+TEST_F(TestEnvTest, MatchNonExistentFile) {
+  EXPECT_EQ(Match(base_dir_, "thereisnosuchfile"), "");
+}
+
+TEST_F(TestEnvTest, MatchSimple) {
+  // Create a few files.
+  TF_EXPECT_OK(
+      WriteStringToFile(env_, io::JoinPath(base_dir_, "match-00"), ""));
+  TF_EXPECT_OK(
+      WriteStringToFile(env_, io::JoinPath(base_dir_, "match-0a"), ""));
+  TF_EXPECT_OK(
+      WriteStringToFile(env_, io::JoinPath(base_dir_, "match-01"), ""));
+  TF_EXPECT_OK(
+      WriteStringToFile(env_, io::JoinPath(base_dir_, "match-aaa"), ""));
+
+  EXPECT_EQ(Match(base_dir_, "match-*"),
+            "match-00,match-01,match-0a,match-aaa");
+  EXPECT_EQ(Match(base_dir_, "match-0[0-9]"), "match-00,match-01");
+  EXPECT_EQ(Match(base_dir_, "match-?[0-9]"), "match-00,match-01");
+  EXPECT_EQ(Match(base_dir_, "match-?a*"), "match-0a,match-aaa");
+  EXPECT_EQ(Match(base_dir_, "match-??"), "match-00,match-01,match-0a");
+}
+
+TEST_F(TestEnvTest, MatchDirectory) {
+  // Create some directories.
+  TF_EXPECT_OK(
+      env_->RecursivelyCreateDir(io::JoinPath(base_dir_, "match-00/abc")));
+  TF_EXPECT_OK(
+      env_->RecursivelyCreateDir(io::JoinPath(base_dir_, "match-0a/abc")));
+  TF_EXPECT_OK(
+      env_->RecursivelyCreateDir(io::JoinPath(base_dir_, "match-01/abc")));
+  TF_EXPECT_OK(
+      env_->RecursivelyCreateDir(io::JoinPath(base_dir_, "match-aaa/abc")));
+
+  // Create a few files.
+  TF_EXPECT_OK(
+      WriteStringToFile(env_, io::JoinPath(base_dir_, "match-00/abc/x"), ""));
+  TF_EXPECT_OK(
+      WriteStringToFile(env_, io::JoinPath(base_dir_, "match-0a/abc/x"), ""));
+  TF_EXPECT_OK(
+      WriteStringToFile(env_, io::JoinPath(base_dir_, "match-01/abc/x"), ""));
+  TF_EXPECT_OK(
+      WriteStringToFile(env_, io::JoinPath(base_dir_, "match-aaa/abc/x"), ""));
+
+  EXPECT_EQ(Match(base_dir_, "match-*/abc/x"),
+            "match-00/abc/x,match-01/abc/x,match-0a/abc/x,match-aaa/abc/x");
+  EXPECT_EQ(Match(base_dir_, "match-0[0-9]/abc/x"),
+            "match-00/abc/x,match-01/abc/x");
+  EXPECT_EQ(Match(base_dir_, "match-?[0-9]/abc/x"),
+            "match-00/abc/x,match-01/abc/x");
+  EXPECT_EQ(Match(base_dir_, "match-?a*/abc/x"),
+            "match-0a/abc/x,match-aaa/abc/x");
+  EXPECT_EQ(Match(base_dir_, "match-?[^a]/abc/x"),
+            "match-00/abc/x,match-01/abc/x");
+}
+
+TEST_F(TestEnvTest, MatchMultipleWildcards) {
+  // Create some directories.
+  TF_EXPECT_OK(
+      env_->RecursivelyCreateDir(io::JoinPath(base_dir_, "match-00/abc")));
+  TF_EXPECT_OK(
+      env_->RecursivelyCreateDir(io::JoinPath(base_dir_, "match-01/abc")));
+  TF_EXPECT_OK(
+      env_->RecursivelyCreateDir(io::JoinPath(base_dir_, "match-02/abc")));
+
+  // Create a few files.
+  TF_EXPECT_OK(
+      WriteStringToFile(env_, io::JoinPath(base_dir_, "match-00/abc/00"), ""));
+  TF_EXPECT_OK(
+      WriteStringToFile(env_, io::JoinPath(base_dir_, "match-00/abc/01"), ""));
+  TF_EXPECT_OK(
+      WriteStringToFile(env_, io::JoinPath(base_dir_, "match-00/abc/09"), ""));
+  TF_EXPECT_OK(
+      WriteStringToFile(env_, io::JoinPath(base_dir_, "match-01/abc/00"), ""));
+  TF_EXPECT_OK(
+      WriteStringToFile(env_, io::JoinPath(base_dir_, "match-01/abc/04"), ""));
+  TF_EXPECT_OK(
+      WriteStringToFile(env_, io::JoinPath(base_dir_, "match-01/abc/10"), ""));
+  TF_EXPECT_OK(
+      WriteStringToFile(env_, io::JoinPath(base_dir_, "match-02/abc/00"), ""));
+
+  EXPECT_EQ(Match(base_dir_, "match-0[0-1]/abc/0[0-8]"),
+            "match-00/abc/00,match-00/abc/01,match-01/abc/00,match-01/abc/04");
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/file_system.cc b/tensorflow/core/platform/file_system.cc
index e168101a058..928bf644f4e 100644
--- a/tensorflow/core/platform/file_system.cc
+++ b/tensorflow/core/platform/file_system.cc
@@ -51,35 +51,34 @@ WritableFile::~WritableFile() {}
 
 FileSystemRegistry::~FileSystemRegistry() {}
 
-string GetSchemeFromURI(const string& name) {
-  auto colon_loc = name.find(":");
+void ParseURI(StringPiece remaining, StringPiece* scheme, StringPiece* host,
+              StringPiece* path) {
+  // 0. Parse scheme
   // Make sure scheme matches [a-zA-Z][0-9a-zA-Z.]*
   // TODO(keveman): Allow "+" and "-" in the scheme.
-  if (colon_loc != string::npos &&
-      strings::Scanner(StringPiece(name.data(), colon_loc))
-          .One(strings::Scanner::LETTER)
-          .Many(strings::Scanner::LETTER_DIGIT_DOT)
-          .GetResult()) {
-    return name.substr(0, colon_loc);
+  if (!strings::Scanner(remaining)
+           .One(strings::Scanner::LETTER)
+           .Many(strings::Scanner::LETTER_DIGIT_DOT)
+           .StopCapture()
+           .OneLiteral("://")
+           .GetResult(&remaining, scheme)) {
+    // If there's no scheme, assume the entire string is a path.
+    scheme->clear();
+    host->clear();
+    *path = remaining;
+    return;
   }
-  return "";
-}
 
-string GetNameFromURI(const string& name) {
-  string scheme = GetSchemeFromURI(name);
-  if (scheme == "") {
-    return name;
+  // 1. Parse host
+  if (!strings::Scanner(remaining).ScanUntil('/').GetResult(&remaining, host)) {
+    // No path, so the rest of the URI is the host.
+    *host = remaining;
+    path->clear();
+    return;
   }
-  // Skip the 'scheme:' portion.
-  StringPiece filename{name.data() + scheme.length() + 1,
-                       name.length() - scheme.length() - 1};
-  // If the URI confirmed to scheme://filename, skip the two '/'s and return
-  // filename. Otherwise return the original 'name', and leave it up to the
-  // implementations to handle the full URI.
-  if (filename.Consume("//")) {
-    return filename.ToString();
-  }
-  return name;
+
+  // 2. The rest is the path
+  *path = remaining;
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/file_system.h b/tensorflow/core/platform/file_system.h
index 4941a7b474b..2c653dded0c 100644
--- a/tensorflow/core/platform/file_system.h
+++ b/tensorflow/core/platform/file_system.h
@@ -175,9 +175,7 @@ class RandomAccessFile {
                       char* scratch) const = 0;
 
  private:
-  /// No copying allowed
-  RandomAccessFile(const RandomAccessFile&);
-  void operator=(const RandomAccessFile&);
+  TF_DISALLOW_COPY_AND_ASSIGN(RandomAccessFile);
 };
 
 /// \brief A file abstraction for sequential writing.
@@ -195,9 +193,7 @@ class WritableFile {
   virtual Status Sync() = 0;
 
  private:
-  /// No copying allowed
-  WritableFile(const WritableFile&);
-  void operator=(const WritableFile&);
+  TF_DISALLOW_COPY_AND_ASSIGN(WritableFile);
 };
 
 /// \brief A readonly memmapped file abstraction.
@@ -229,11 +225,14 @@ class FileSystemRegistry {
       std::vector<string>* schemes) = 0;
 };
 
-// Given URI of the form [scheme://]<filename>, return 'scheme'.
-string GetSchemeFromURI(const string& name);
-
-// Given URI of the form [scheme://]<filename>, return 'filename'.
-string GetNameFromURI(const string& name);
+// Populates the scheme, host, and path from a URI.
+//
+// Corner cases:
+// - If the URI is invalid, scheme and host are set to empty strings and the
+//   passed string is assumed to be a path
+// - If the URI omits the path (e.g. file://host), then the path is left empty.
+void ParseURI(StringPiece uri, StringPiece* scheme, StringPiece* host,
+              StringPiece* path);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/platform/gif.h b/tensorflow/core/platform/gif.h
index d5567abeeac..71cecea37ac 100644
--- a/tensorflow/core/platform/gif.h
+++ b/tensorflow/core/platform/gif.h
@@ -21,7 +21,7 @@ limitations under the License.
 #if defined(PLATFORM_GOOGLE)
 #include "tensorflow/core/platform/google/build_config/gif.h"
 #elif defined(PLATFORM_POSIX) && !defined(IS_MOBILE_PLATFORM)
-#include "giflib-5.1.4/lib/gif_lib.h"
+#include <gif_lib.h>
 #else
 #error Define the appropriate PLATFORM_<foo> macro for this platform
 #endif
diff --git a/tensorflow/core/platform/hadoop/BUILD b/tensorflow/core/platform/hadoop/BUILD
index 457b4fe14cb..b7c43636655 100644
--- a/tensorflow/core/platform/hadoop/BUILD
+++ b/tensorflow/core/platform/hadoop/BUILD
@@ -31,7 +31,7 @@ cc_library(
     deps = [
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/third_party/hadoop:hdfs",
+        "//third_party/hadoop:hdfs",
     ],
     alwayslink = 1,
 )
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
index 662818cc9a7..05615a1a208 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
@@ -21,9 +21,10 @@ limitations under the License.
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/file_system.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/posix/error.h"
-#include "tensorflow/third_party/hadoop/hdfs.h"
+#include "third_party/hadoop/hdfs.h"
 
 namespace tensorflow {
 
@@ -124,22 +125,14 @@ HadoopFileSystem::~HadoopFileSystem() {}
 Status HadoopFileSystem::Connect(StringPiece fname, hdfsFS* fs) {
   TF_RETURN_IF_ERROR(hdfs_->status());
 
-  if (!fname.Consume("hdfs://")) {
-    return errors::InvalidArgument("HDFS path must start with hdfs://");
-  }
-  auto first_slash = fname.find('/');
-  string namenode;
-  if (first_slash == string::npos) {
-    namenode = fname.ToString();
-  } else {
-    namenode = fname.substr(0, first_slash).ToString();
-  }
+  StringPiece scheme, namenode, path;
+  ParseURI(fname, &scheme, &namenode, &path);
 
   hdfsBuilder* builder = hdfs_->hdfsNewBuilder();
-  if (namenode == "localfilesystem") {
+  if (scheme == "file") {
     hdfs_->hdfsBuilderSetNameNode(builder, nullptr);
   } else {
-    hdfs_->hdfsBuilderSetNameNode(builder, namenode.c_str());
+    hdfs_->hdfsBuilderSetNameNode(builder, namenode.ToString().c_str());
   }
   *fs = hdfs_->hdfsBuilderConnect(builder);
   if (*fs == nullptr) {
@@ -149,14 +142,9 @@ Status HadoopFileSystem::Connect(StringPiece fname, hdfsFS* fs) {
 }
 
 string HadoopFileSystem::TranslateName(const string& name) const {
-  StringPiece sp = name;
-  sp.Consume("hdfs://");
-  auto first_slash = sp.find('/');
-  if (first_slash == string::npos) {
-    return string();
-  }
-  sp.remove_prefix(first_slash);
-  return sp.ToString();
+  StringPiece scheme, namenode, path;
+  ParseURI(name, &scheme, &namenode, &path);
+  return path.ToString();
 }
 
 class HDFSRandomAccessFile : public RandomAccessFile {
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc b/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc
index f927e15752f..dfe0590f3b3 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc
@@ -58,8 +58,8 @@ class HadoopFileSystemTest : public ::testing::Test {
 };
 
 TEST_F(HadoopFileSystemTest, RandomAccessFile) {
-  const string fname = io::JoinPath("hdfs://localfilesystem", testing::TmpDir(),
-                                    "RandomAccessFile");
+  const string fname =
+      "file://" + io::JoinPath(testing::TmpDir(), "RandomAccessFile");
   const string content = "abcdefghijklmn";
   TF_ASSERT_OK(WriteString(fname, content));
 
@@ -84,7 +84,7 @@ TEST_F(HadoopFileSystemTest, RandomAccessFile) {
 TEST_F(HadoopFileSystemTest, WritableFile) {
   std::unique_ptr<WritableFile> writer;
   const string fname =
-      io::JoinPath("hdfs://localfilesystem", testing::TmpDir(), "WritableFile");
+      "file://" + io::JoinPath(testing::TmpDir(), "WritableFile");
   TF_EXPECT_OK(hdfs.NewWritableFile(fname, &writer));
   TF_EXPECT_OK(writer->Append("content1,"));
   TF_EXPECT_OK(writer->Append("content2"));
@@ -99,7 +99,7 @@ TEST_F(HadoopFileSystemTest, WritableFile) {
 
 TEST_F(HadoopFileSystemTest, FileExists) {
   const string fname =
-      io::JoinPath("hdfs://localfilesystem", testing::TmpDir(), "FileExists");
+      "file://" + io::JoinPath(testing::TmpDir(), "FileExists");
   EXPECT_FALSE(hdfs.FileExists(fname));
   TF_ASSERT_OK(WriteString(fname, "test"));
   EXPECT_TRUE(hdfs.FileExists(fname));
@@ -107,7 +107,7 @@ TEST_F(HadoopFileSystemTest, FileExists) {
 
 TEST_F(HadoopFileSystemTest, GetChildren) {
   const string base =
-      io::JoinPath("hdfs://localfilesystem", testing::TmpDir(), "GetChildren");
+      "file://" + io::JoinPath(testing::TmpDir(), "GetChildren");
   TF_EXPECT_OK(hdfs.CreateDir(base));
 
   const string file = io::JoinPath(base, "testfile.csv");
@@ -115,15 +115,15 @@ TEST_F(HadoopFileSystemTest, GetChildren) {
   const string subdir = io::JoinPath(base, "subdir");
   TF_EXPECT_OK(hdfs.CreateDir(subdir));
 
-  vector<string> children;
+  std::vector<string> children;
   TF_EXPECT_OK(hdfs.GetChildren(base, &children));
   std::sort(children.begin(), children.end());
-  EXPECT_EQ(vector<string>({"subdir", "testfile.csv"}), children);
+  EXPECT_EQ(std::vector<string>({"subdir", "testfile.csv"}), children);
 }
 
 TEST_F(HadoopFileSystemTest, DeleteFile) {
   const string fname =
-      io::JoinPath("hdfs://localfilesystem", testing::TmpDir(), "DeleteFile");
+      "file://" + io::JoinPath(testing::TmpDir(), "DeleteFile");
   EXPECT_FALSE(hdfs.DeleteFile(fname).ok());
   TF_ASSERT_OK(WriteString(fname, "test"));
   TF_EXPECT_OK(hdfs.DeleteFile(fname));
@@ -131,7 +131,7 @@ TEST_F(HadoopFileSystemTest, DeleteFile) {
 
 TEST_F(HadoopFileSystemTest, GetFileSize) {
   const string fname =
-      io::JoinPath("hdfs://localfilesystem", testing::TmpDir(), "GetFileSize");
+      "file://" + io::JoinPath(testing::TmpDir(), "GetFileSize");
   TF_ASSERT_OK(WriteString(fname, "test"));
   uint64 file_size = 0;
   TF_EXPECT_OK(hdfs.GetFileSize(fname, &file_size));
@@ -139,8 +139,8 @@ TEST_F(HadoopFileSystemTest, GetFileSize) {
 }
 
 TEST_F(HadoopFileSystemTest, CreateDirStat) {
-  const string dir = io::JoinPath("hdfs://localfilesystem", testing::TmpDir(),
-                                  "CreateDirStat");
+  const string dir =
+      "file://" + io::JoinPath(testing::TmpDir(), "CreateDirStat");
   TF_EXPECT_OK(hdfs.CreateDir(dir));
   FileStatistics stat;
   TF_EXPECT_OK(hdfs.Stat(dir, &stat));
@@ -148,8 +148,7 @@ TEST_F(HadoopFileSystemTest, CreateDirStat) {
 }
 
 TEST_F(HadoopFileSystemTest, DeleteDir) {
-  const string dir =
-      io::JoinPath("hdfs://localfilesystem", testing::TmpDir(), "DeleteDir");
+  const string dir = "file://" + io::JoinPath(testing::TmpDir(), "DeleteDir");
   EXPECT_FALSE(hdfs.DeleteDir(dir).ok());
   TF_EXPECT_OK(hdfs.CreateDir(dir));
   TF_EXPECT_OK(hdfs.DeleteDir(dir));
@@ -159,9 +158,9 @@ TEST_F(HadoopFileSystemTest, DeleteDir) {
 
 TEST_F(HadoopFileSystemTest, RenameFile) {
   const string fname1 =
-      io::JoinPath("hdfs://localfilesystem", testing::TmpDir(), "RenameFile1");
+      "file://" + io::JoinPath(testing::TmpDir(), "RenameFile1");
   const string fname2 =
-      io::JoinPath("hdfs://localfilesystem", testing::TmpDir(), "RenameFile2");
+      "file://" + io::JoinPath(testing::TmpDir(), "RenameFile2");
   TF_ASSERT_OK(WriteString(fname1, "test"));
   TF_EXPECT_OK(hdfs.RenameFile(fname1, fname2));
   string content;
@@ -170,8 +169,7 @@ TEST_F(HadoopFileSystemTest, RenameFile) {
 }
 
 TEST_F(HadoopFileSystemTest, StatFile) {
-  const string fname =
-      io::JoinPath("hdfs://localfilesystem", testing::TmpDir(), "StatFile");
+  const string fname = "file://" + io::JoinPath(testing::TmpDir(), "StatFile");
   TF_ASSERT_OK(WriteString(fname, "test"));
   FileStatistics stat;
   TF_EXPECT_OK(hdfs.Stat(fname, &stat));
diff --git a/tensorflow/core/platform/jpeg.h b/tensorflow/core/platform/jpeg.h
index 4306860f9a1..c9ddc23ff17 100644
--- a/tensorflow/core/platform/jpeg.h
+++ b/tensorflow/core/platform/jpeg.h
@@ -21,11 +21,13 @@ limitations under the License.
 #if defined(PLATFORM_GOOGLE)
 #include "tensorflow/core/platform/google/build_config/jpeg.h"
 #elif defined(PLATFORM_POSIX) && !defined(IS_MOBILE_PLATFORM)
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
 extern "C" {
-#include "jpeg-9a/jerror.h"
-#include "jpeg-9a/jinclude.h"
-#include "jpeg-9a/jpeglib.h"
-#include "jpeg-9a/transupp.h"  // for rotations
+#include <jerror.h>
+#include <jpeglib.h>
 }
 #else
 #error Define the appropriate PLATFORM_<foo> macro for this platform
diff --git a/tensorflow/core/platform/png.h b/tensorflow/core/platform/png.h
index 694959ec07d..dedb294843d 100644
--- a/tensorflow/core/platform/png.h
+++ b/tensorflow/core/platform/png.h
@@ -21,7 +21,7 @@ limitations under the License.
 #if defined(PLATFORM_GOOGLE)
 #include "tensorflow/core/platform/google/build_config/png.h"
 #elif defined(PLATFORM_POSIX) && !defined(IS_MOBILE_PLATFORM)
-#include "libpng-1.2.53/png.h"
+#include <png.h>
 #else
 #error Define the appropriate PLATFORM_<foo> macro for this platform
 #endif
diff --git a/tensorflow/core/platform/posix/env.cc b/tensorflow/core/platform/posix/env.cc
index 8ce756fa7e6..75e300a37d2 100644
--- a/tensorflow/core/platform/posix/env.cc
+++ b/tensorflow/core/platform/posix/env.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include <dirent.h>
 #include <errno.h>
 #include <fcntl.h>
+#include <fnmatch.h>
 #include <stdio.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
@@ -55,6 +56,10 @@ class PosixEnv : public Env {
 
   ~PosixEnv() override { LOG(FATAL) << "Env::Default() must not be destroyed"; }
 
+  bool MatchPath(const string& path, const string& pattern) override {
+    return fnmatch(pattern.c_str(), path.c_str(), FNM_PATHNAME) == 0;
+  }
+
   uint64 NowMicros() override {
     struct timeval tv;
     gettimeofday(&tv, NULL);
diff --git a/tensorflow/core/platform/posix/posix_file_system.h b/tensorflow/core/platform/posix/posix_file_system.h
index 38c3b0ae03a..07bb8c9a6ff 100644
--- a/tensorflow/core/platform/posix/posix_file_system.h
+++ b/tensorflow/core/platform/posix/posix_file_system.h
@@ -62,7 +62,9 @@ Status IOError(const string& context, int err_number);
 class LocalPosixFileSystem : public PosixFileSystem {
  public:
   string TranslateName(const string& name) const override {
-    return GetNameFromURI(name);
+    StringPiece scheme, host, path;
+    ParseURI(name, &scheme, &host, &path);
+    return path.ToString();
   }
 };
 
diff --git a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.cc b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.cc
index d3542060882..61b6fa0c84b 100644
--- a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.cc
+++ b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.cc
@@ -13,7 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h"
+
 #if defined(__ANDROID__) && defined(__ARM_ARCH_7A__) && (__ANDROID_API__ >= 21)
+
 #include <asm/unistd.h>
 #include <linux/perf_event.h>
 #include <stdio.h>
@@ -25,7 +28,6 @@ limitations under the License.
 
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h"
 
 namespace tensorflow {
 namespace profile_utils {
@@ -62,9 +64,7 @@ void AndroidArmV7ACpuUtilsHelper::EnableClockCycleProfiling(const bool enable) {
                    << "be scaled. (max = " << cpu0_scaling_max << ", min "
                    << cpu0_scaling_min << ")";
     }
-    return;
-  }
-  if (enable) {
+    ResetClockCycle();
     ioctl(fd_, PERF_EVENT_IOC_ENABLE, 0);
   } else {
     ioctl(fd_, PERF_EVENT_IOC_DISABLE, 0);
@@ -76,11 +76,11 @@ int64 AndroidArmV7ACpuUtilsHelper::CalculateCpuFrequency() {
 }
 
 void AndroidArmV7ACpuUtilsHelper::InitializeInternal() {
-  struct perf_event_attr pe;
+  perf_event_attr pe;
 
-  memset(&pe, 0, sizeof(struct perf_event_attr));
+  memset(&pe, 0, sizeof(perf_event_attr));
   pe.type = PERF_TYPE_HARDWARE;
-  pe.size = sizeof(struct perf_event_attr);
+  pe.size = sizeof(perf_event_attr);
   pe.config = PERF_COUNT_HW_CPU_CYCLES;
   pe.disabled = 1;
   pe.exclude_kernel = 1;
@@ -95,9 +95,10 @@ void AndroidArmV7ACpuUtilsHelper::InitializeInternal() {
   }
 }
 
-int AndroidArmV7ACpuUtilsHelper::OpenPerfEvent(
-    struct perf_event_attr *const hw_event, const pid_t pid, const int cpu,
-    const int group_fd, const unsigned long flags) {
+int AndroidArmV7ACpuUtilsHelper::OpenPerfEvent(perf_event_attr *const hw_event,
+                                               const pid_t pid, const int cpu,
+                                               const int group_fd,
+                                               const unsigned long flags) {
   const int ret =
       syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
   return ret;
@@ -128,7 +129,6 @@ int64 AndroidArmV7ACpuUtilsHelper::ReadCpuFrequencyFile(
 #else
 
 // Dummy implementations to avoid link error.
-#include "tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h"
 
 namespace tensorflow {
 namespace profile_utils {
@@ -136,7 +136,7 @@ namespace profile_utils {
 void AndroidArmV7ACpuUtilsHelper::ResetClockCycle() {}
 uint64 AndroidArmV7ACpuUtilsHelper::GetCurrentClockCycle() { return 1; }
 void AndroidArmV7ACpuUtilsHelper::EnableClockCycleProfiling(bool) {}
-int AndroidArmV7ACpuUtilsHelper::OpenPerfEvent(struct perf_event_attr *const,
+int AndroidArmV7ACpuUtilsHelper::OpenPerfEvent(perf_event_attr *const,
                                                const pid_t, const int,
                                                const int, const unsigned long) {
   return 0;
diff --git a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h
index 969893cf385..c4fe2fc5b3c 100644
--- a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h
+++ b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h
@@ -20,6 +20,8 @@ limitations under the License.
 #include "tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h"
 #include "tensorflow/core/platform/types.h"
 
+struct perf_event_attr;
+
 namespace tensorflow {
 namespace profile_utils {
 
@@ -39,7 +41,7 @@ class AndroidArmV7ACpuUtilsHelper : public ICpuUtilsHelper {
   void InitializeInternal();
 
   // syscall __NR_perf_event_open with arguments
-  int OpenPerfEvent(struct perf_event_attr *const hw_event, const pid_t pid,
+  int OpenPerfEvent(perf_event_attr *const hw_event, const pid_t pid,
                     const int cpu, const int group_fd,
                     const unsigned long flags);
 
diff --git a/tensorflow/core/platform/protobuf.h b/tensorflow/core/platform/protobuf.h
index 6aa89360ab5..27928f3b2cd 100644
--- a/tensorflow/core/platform/protobuf.h
+++ b/tensorflow/core/platform/protobuf.h
@@ -28,7 +28,7 @@ limitations under the License.
 // refer to all protobuf APIs.
 
 #if defined(PLATFORM_GOOGLE)
-#include "tensorflow/core/platform/google/build_config/protobuf.h"
+#include "tensorflow/core/platform/google/protobuf.h"
 #else
 #include "tensorflow/core/platform/default/protobuf.h"
 #endif
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index 327710444ef..dc90c17bc04 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -112,6 +112,10 @@ message GraphOptions {
 
   // If true, transfer float values between processes as bfloat16.
   bool enable_bfloat16_sendrecv = 7;
+
+  // If > 0, record a timeline every this many steps.
+  // EXPERIMENTAL: This currently has no effect in MasterSession.
+  int32 timeline_step = 8;
 };
 
 message ThreadPoolOptionProto {
diff --git a/tensorflow/core/protobuf/meta_graph.proto b/tensorflow/core/protobuf/meta_graph.proto
index cd3dbf86a25..65e1e9fb7dc 100644
--- a/tensorflow/core/protobuf/meta_graph.proto
+++ b/tensorflow/core/protobuf/meta_graph.proto
@@ -67,6 +67,9 @@ message MetaGraphDef {
   // signature_def: Map from user supplied key for a signature to a single
   // SignatureDef.
   map<string, SignatureDef> signature_def = 5;
+
+  // Asset file def to be used with the defined graph.
+  repeated AssetFileDef asset_file_def = 6;
 }
 
 // CollectionDef should cover most collections.
@@ -266,3 +269,14 @@ message SignatureDef {
   // where a single graph computation may return multiple results.
   string method_name = 3;
 }
+
+// An asset file def for a single file or a set of sharded files with the same
+// name.
+message AssetFileDef {
+  // The tensor to bind the asset filename to.
+  TensorInfo tensor_info = 1;
+  // The filename within an assets directory. Note: does not include the path
+  // prefix, i.e. directories. For an asset at /tmp/path/vocab.txt, the filename
+  // would be "vocab.txt".
+  string filename = 2;
+}
diff --git a/tensorflow/core/protobuf/tensor_bundle.proto b/tensorflow/core/protobuf/tensor_bundle.proto
new file mode 100644
index 00000000000..80e87f14f94
--- /dev/null
+++ b/tensorflow/core/protobuf/tensor_bundle.proto
@@ -0,0 +1,64 @@
+syntax = "proto3";
+
+package tensorflow;
+option cc_enable_arenas = true;
+option java_outer_classname = "TensorBundleProtos";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.util";
+
+import "tensorflow/core/framework/tensor_shape.proto";
+import "tensorflow/core/framework/tensor_slice.proto";
+import "tensorflow/core/framework/types.proto";
+import "tensorflow/core/framework/versions.proto";
+
+// Protos used in the tensor bundle module (tf/core/util/tensor_bundle/).
+
+// Special header that is associated with a bundle.
+//
+// TODO(zongheng,zhifengc): maybe in the future, we can add information about
+// which binary produced this checkpoint, a timestamp, etc. Sometimes, this can
+// be valuable debugging information. And if needed, it can be used as defensive
+// information ensuring that the reader (binary version) of the checkpoint and
+// the writer (binary version) match within a certain range, etc.
+message BundleHeaderProto {
+  // Number of data files in the bundle.
+  int32 num_shards = 1;
+
+  // An enum indicating the endianness of the platform that produced this
+  // bundle.  A bundle can only be read by a platform with matching endianness.
+  // Defaults to LITTLE, as most modern platforms are little-endian.
+  //
+  // Affects the binary tensor data bytes only, not the metadata in protobufs.
+  enum Endianness {
+    LITTLE = 0;
+    BIG = 1;
+  }
+  Endianness endianness = 2;
+
+  // Versioning of the tensor bundle format.
+  VersionDef version = 3;
+}
+
+// Describes the metadata related to a checkpointed tensor.
+message BundleEntryProto {
+  // The tensor dtype and shape.
+  DataType dtype = 1;
+  TensorShapeProto shape = 2;
+  // The binary content of the tensor lies in:
+  //   File "shard_id": bytes [offset, offset + size).
+  int32 shard_id = 3;
+  int64 offset = 4;
+  int64 size = 5;
+
+  // The CRC32C checksum of the tensor bytes.
+  fixed32 crc32c = 6;
+
+  // Iff present, this entry represents a partitioned tensor.  The previous
+  // fields are interpreted as follows:
+  //
+  //   "dtype", "shape": describe the full tensor.
+  //   "shard_id", "offset", "size", "crc32c": all IGNORED.
+  //      This information for each slice can be looked up in its own
+  //      BundleEntryProto, keyed by each "slice_name".
+  repeated TensorSliceProto slices = 7;
+}
diff --git a/tensorflow/core/util/command_line_flags.cc b/tensorflow/core/util/command_line_flags.cc
index 8927a265444..2048126338a 100644
--- a/tensorflow/core/util/command_line_flags.cc
+++ b/tensorflow/core/util/command_line_flags.cc
@@ -47,6 +47,22 @@ bool ParseInt32Flag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
   return false;
 }
 
+bool ParseInt64Flag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
+                    tensorflow::int64* dst, bool* value_parsing_ok) {
+  *value_parsing_ok = true;
+  if (arg.Consume("--") && arg.Consume(flag) && arg.Consume("=")) {
+    char extra;
+    if (sscanf(arg.data(), "%lld%c", dst, &extra) != 1) {
+      LOG(ERROR) << "Couldn't interpret value " << arg << " for flag " << flag
+                 << ".";
+      *value_parsing_ok = false;
+    }
+    return true;
+  }
+
+  return false;
+}
+
 bool ParseBoolFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                    bool* dst, bool* value_parsing_ok) {
   *value_parsing_ok = true;
@@ -78,6 +94,9 @@ bool ParseBoolFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
 Flag::Flag(const char* name, tensorflow::int32* dst)
     : name_(name), type_(TYPE_INT), int_value_(dst) {}
 
+Flag::Flag(const char* name, tensorflow::int64* dst)
+    : name_(name), type_(TYPE_INT64), int64_value_(dst) {}
+
 Flag::Flag(const char* name, bool* dst)
     : name_(name), type_(TYPE_BOOL), bool_value_(dst) {}
 
@@ -88,6 +107,8 @@ bool Flag::Parse(string arg, bool* value_parsing_ok) const {
   bool result = false;
   if (type_ == TYPE_INT) {
     result = ParseInt32Flag(arg, name_, int_value_, value_parsing_ok);
+  } else if (type_ == TYPE_INT64) {
+    result = ParseInt64Flag(arg, name_, int64_value_, value_parsing_ok);
   } else if (type_ == TYPE_BOOL) {
     result = ParseBoolFlag(arg, name_, bool_value_, value_parsing_ok);
   } else if (type_ == TYPE_STRING) {
diff --git a/tensorflow/core/util/command_line_flags.h b/tensorflow/core/util/command_line_flags.h
index 7e74240e538..9297fb066d1 100644
--- a/tensorflow/core/util/command_line_flags.h
+++ b/tensorflow/core/util/command_line_flags.h
@@ -49,6 +49,7 @@ namespace tensorflow {
 class Flag {
  public:
   Flag(const char* name, int32* dst1);
+  Flag(const char* name, int64* dst1);
   Flag(const char* name, bool* dst);
   Flag(const char* name, string* dst);
 
@@ -56,8 +57,9 @@ class Flag {
 
  private:
   string name_;
-  enum { TYPE_INT, TYPE_BOOL, TYPE_STRING } type_;
+  enum { TYPE_INT, TYPE_INT64, TYPE_BOOL, TYPE_STRING } type_;
   int* int_value_;
+  int64* int64_value_;
   bool* bool_value_;
   string* string_value_;
 };
diff --git a/tensorflow/core/util/command_line_flags_test.cc b/tensorflow/core/util/command_line_flags_test.cc
index 1cdddf363db..bc38fff8fde 100644
--- a/tensorflow/core/util/command_line_flags_test.cc
+++ b/tensorflow/core/util/command_line_flags_test.cc
@@ -33,19 +33,21 @@ std::vector<char*> CharPointerVectorFromStrings(
 
 TEST(CommandLineFlagsTest, BasicUsage) {
   int some_int = 10;
+  int64 some_int64 = 21474836470;  // max int32 is 2147483647
   bool some_switch = false;
   tensorflow::string some_name = "something";
-  int argc = 4;
+  int argc = 5;
   std::vector<tensorflow::string> argv_strings = {
-      "program_name", "--some_int=20", "--some_switch",
-      "--some_name=somethingelse"};
+      "program_name", "--some_int=20", "--some_int64=214748364700",
+      "--some_switch", "--some_name=somethingelse"};
   std::vector<char*> argv_array = CharPointerVectorFromStrings(argv_strings);
-  bool parsed_ok =
-      ParseFlags(&argc, argv_array.data(), {Flag("some_int", &some_int),
-                                            Flag("some_switch", &some_switch),
-                                            Flag("some_name", &some_name)});
+  bool parsed_ok = ParseFlags(
+      &argc, argv_array.data(),
+      {Flag("some_int", &some_int), Flag("some_int64", &some_int64),
+       Flag("some_switch", &some_switch), Flag("some_name", &some_name)});
   EXPECT_EQ(true, parsed_ok);
   EXPECT_EQ(20, some_int);
+  EXPECT_EQ(214748364700, some_int64);
   EXPECT_EQ(true, some_switch);
   EXPECT_EQ("somethingelse", some_name);
   EXPECT_EQ(argc, 1);
diff --git a/tensorflow/core/util/tensor_bundle/BUILD b/tensorflow/core/util/tensor_bundle/BUILD
new file mode 100644
index 00000000000..d7e94d47dc8
--- /dev/null
+++ b/tensorflow/core/util/tensor_bundle/BUILD
@@ -0,0 +1,53 @@
+# Description:
+# Tensor bundle: a module to efficiently serialize and deserialize tensors.
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+cc_library(
+    name = "tensor_bundle",
+    srcs = ["tensor_bundle.cc"],
+    hdrs = ["tensor_bundle.h"],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+cc_test(
+    name = "tensor_bundle_test",
+    srcs = ["tensor_bundle_test.cc"],
+    deps = [
+        ":tensor_bundle",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensor_testutil",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+# -----------------------------------------------------------------------------
+# Google-internal targets.  These must be at the end for syncrepo.
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
new file mode 100644
index 00000000000..26a050743e6
--- /dev/null
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
@@ -0,0 +1,842 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h"
+
+#include <algorithm>
+#include <memory>
+#include <utility>
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/versions.h"
+#include "tensorflow/core/lib/core/coding.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/lib/hash/crc32c.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/io/table_builder.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/util/saved_tensor_slice_util.h"
+#include "tensorflow/core/util/tensor_slice_util.h"
+
+namespace tensorflow {
+
+// Versioning of the tensor bundle format.
+const int kTensorBundleMinProducer = 0;
+const int kTensorBundleMinConsumer = 0;
+const int kTensorBundleVersion = 1;
+
+// Key to the special BundleHeaderProto entry.  Do not change this, as clients
+// can make the assumption that the header is always the first entry in the
+// bundle.
+const char* const kHeaderEntryKey = "";
+
+namespace {
+
+// Reads "num_elements" string elements from file[offset, offset+size) into the
+// length-N "destination".  Discards the original content of "destination".
+//
+// Checksums the string lengths (as restored uint32, not varint32 bytes) and
+// string bytes, and stores it into "actual_crc32c".
+Status ReadStringTensor(io::InputBuffer* buffered_file, size_t num_elements,
+                        size_t offset, size_t size, string* destination,
+                        uint32* actual_crc32c) {
+  if (size == 0) return Status::OK();
+  CHECK_GT(size, 0);
+
+  // Reads "num_elements" varint32's from "buffered_file".
+  TF_RETURN_IF_ERROR(buffered_file->Seek(offset));
+  std::vector<uint32> string_lengths(num_elements);
+  for (size_t i = 0; i < num_elements; ++i) {
+    TF_RETURN_IF_ERROR(buffered_file->ReadVarint32(&string_lengths[i]));
+  }
+  if (offset + size < buffered_file->Tell()) {
+    return errors::DataLoss("String lengths longer than expected offset ",
+                            offset + size);
+  }
+  *actual_crc32c =
+      crc32c::Value(reinterpret_cast<const char*>(string_lengths.data()),
+                    sizeof(uint32) * num_elements);
+
+  // Reads the length-checksum.
+  uint32 length_checksum = 0;
+  size_t unused_bytes_read = 0;
+  TF_RETURN_IF_ERROR(buffered_file->ReadNBytes(
+      sizeof(uint32), reinterpret_cast<char*>(&length_checksum),
+      &unused_bytes_read));
+  if (crc32c::Unmask(length_checksum) != *actual_crc32c) {
+    return errors::DataLoss(
+        "The length checksum does not match: expected ",
+        strings::Printf("%08u", crc32c::Unmask(length_checksum)),
+        " but actual is ", strings::Printf("%08u", *actual_crc32c));
+  }
+  *actual_crc32c =
+      crc32c::Extend(*actual_crc32c, reinterpret_cast<char*>(&length_checksum),
+                     sizeof(uint32));
+
+  // Reads the actual string bytes.
+  for (size_t i = 0; i < num_elements; ++i) {
+    const uint32 string_length = string_lengths[i];
+    string* buffer = &destination[i];
+
+    buffer->resize(string_length);
+    size_t bytes_read = 0;
+    TF_RETURN_IF_ERROR(
+        buffered_file->ReadNBytes(string_length, &(*buffer)[0], &bytes_read));
+    *actual_crc32c = crc32c::Extend(*actual_crc32c, buffer->data(), bytes_read);
+  }
+  return Status::OK();
+}
+
+char* GetBackingBuffer(const Tensor& val) {
+  CHECK(DataTypeCanUseMemcpy(val.dtype()));
+  return const_cast<char*>(val.tensor_data().data());
+}
+
+string* GetStringBackingBuffer(const Tensor& val) {
+  CHECK_EQ(DT_STRING, val.dtype());
+  return const_cast<string*>(val.flat<string>().data());
+}
+
+Status ParseEntryProto(StringPiece key, StringPiece value,
+                       protobuf::MessageLite* out) {
+  if (!out->ParseFromArray(value.data(), value.size())) {
+    return errors::DataLoss("Entry for key ", key, " not parseable.");
+  }
+  return Status::OK();
+}
+
+// Serializes the data bytes of the non-string tensor "val".  Discards the
+// original content of "bytes_written", and on OK updates it with number of
+// bytes written.
+// REQUIRES: val.dtype() != DT_STRING
+Status WriteTensor(const Tensor& val, FileOutputBuffer* out,
+                   size_t* bytes_written) {
+  DCHECK_NE(val.dtype(), DT_STRING);
+  *bytes_written = val.TotalBytes();
+  char* buf = GetBackingBuffer(val);
+  VLOG(1) << "Appending " << *bytes_written << " bytes to file";
+  return out->Append(StringPiece(buf, *bytes_written));
+}
+
+// Serializes string tensor "val".  "bytes_written" is treated in the same
+// fashion as WriteTensor().
+//
+// Checksums all bytes written and stores it into "crc32c".
+// REQUIRES: val.dtype() == DT_STRING
+Status WriteStringTensor(const Tensor& val, FileOutputBuffer* out,
+                         size_t* bytes_written, uint32* crc32c) {
+  // On-disk format:
+  //   [varint32 len0]..[varint32 lenL][4 byte cksum on lengths][string bytes]
+  // Var "crc32c" checksums the string lengths (as uint32, not varint32 bytes),
+  // the length-checksum, and all the string bytes.
+  DCHECK_EQ(val.dtype(), DT_STRING);
+  const string* strings = GetStringBackingBuffer(val);
+
+  // Writes the varint lengths.
+  string lengths;
+  lengths.reserve(val.NumElements());  // At least 1 byte per element.
+  *crc32c = 0;
+  for (int64 i = 0; i < val.NumElements(); ++i) {
+    const string* elem = &strings[i];
+    DCHECK_EQ(elem->size(), static_cast<uint32>(elem->size()));
+    const uint32 elem_size = static_cast<uint32>(elem->size());
+
+    core::PutVarint32(&lengths, elem_size);
+    *crc32c = crc32c::Extend(*crc32c, reinterpret_cast<const char*>(&elem_size),
+                             sizeof(uint32));
+  }
+  TF_RETURN_IF_ERROR(out->Append(lengths));
+  *bytes_written = lengths.size();
+
+  // Writes the length checksum.
+  const uint32 length_checksum = crc32c::Mask(*crc32c);
+  TF_RETURN_IF_ERROR(out->Append(StringPiece(
+      reinterpret_cast<const char*>(&length_checksum), sizeof(uint32))));
+  *crc32c = crc32c::Extend(
+      *crc32c, reinterpret_cast<const char*>(&length_checksum), sizeof(uint32));
+  *bytes_written += sizeof(uint32);
+
+  // Writes all the string bytes out.
+  for (int64 i = 0; i < val.NumElements(); ++i) {
+    const string* string = &strings[i];
+    TF_RETURN_IF_ERROR(out->Append(*string));
+    *bytes_written += string->size();
+    *crc32c = crc32c::Extend(*crc32c, string->data(), string->size());
+  }
+  return Status::OK();
+}
+
+// Reads file[offset:offset+size) into destination[0:size).  Each Read() copies
+// at most "buffer_size" bytes.
+//
+// REQUIRES: "file" contains at least "offset + size" bytes.
+// REQUIRES: "destination" contains at least "size" bytes.
+// On error, "destination" may contain garbage.
+Status ReadInputByChunk(const RandomAccessFile* file, size_t offset,
+                        size_t size, size_t buffer_size, char* destination) {
+  if (size == 0) return Status::OK();
+  CHECK_GT(size, 0);
+  CHECK_GT(buffer_size, 0);
+  size_t bytes_read = 0;
+  StringPiece result;
+
+  while (bytes_read < size) {
+    const size_t desired_bytes = std::min(buffer_size, size - bytes_read);
+    Status status = file->Read(offset + bytes_read, desired_bytes, &result,
+                               destination + bytes_read);
+
+    if (!status.ok()) {
+      return status;
+    } else if (result.size() != desired_bytes) {
+      return errors::DataLoss("Requested ", desired_bytes, " bytes but read ",
+                              result.size(), " bytes.");
+    } else if (result.data() == destination + bytes_read) {
+      // Data is already in the correct location.
+    } else {
+      // memmove is guaranteed to handle overlaps safely (although the src and
+      // dst buffers should not overlap for this function).
+      memmove(destination + bytes_read, result.data(), result.size());
+    }
+    bytes_read += result.size();
+  }
+  CHECK_EQ(bytes_read, size);
+  return Status::OK();
+}
+
+}  // namespace
+
+string DataFilename(const string& prefix, int32 shard_id, int32 num_shards) {
+  DCHECK_GT(num_shards, 0);
+  DCHECK_LT(shard_id, num_shards);
+  return strings::Printf("%s.data-%05d-of-%05d", prefix.c_str(), shard_id,
+                         num_shards);
+}
+
+string MetaFilename(const string& prefix) {
+  return strings::Printf("%s.index", prefix.c_str());
+}
+
+BundleWriter::BundleWriter(Env* env, const string& prefix)
+    : env_(env), prefix_(prefix), out_(nullptr), size_(0) {
+  status_ =
+      env_->CreateDir(io::Dirname(prefix_).ToString());  // Ignores errors.
+  const string filename = DataFilename(prefix_, 0, 1);
+  std::unique_ptr<WritableFile> wrapper;
+  status_ = env_->NewWritableFile(filename, &wrapper);
+  if (!status_.ok()) return;
+  out_ = std::unique_ptr<FileOutputBuffer>(
+      new FileOutputBuffer(wrapper.release(), 8 << 20 /* 8MB write buffer */));
+
+  VLOG(1) << "Writing to file " << filename;
+}
+
+BundleWriter::~BundleWriter() { CHECK(out_ == nullptr); }
+
+Status BundleWriter::Add(const string& key, const Tensor& val) {
+  CHECK_NE(key, kHeaderEntryKey);
+  if (!status_.ok()) return status_;
+  if (entries_.find(key) != entries_.end()) {
+    status_ = errors::InvalidArgument("Adding duplicate key: ", key);
+    return status_;
+  }
+
+  BundleEntryProto* entry = &entries_[key];
+  entry->set_dtype(val.dtype());
+  val.shape().AsProto(entry->mutable_shape());
+  entry->set_shard_id(0);
+  entry->set_offset(size_);
+
+  // Updates the data file.
+  size_t data_bytes_written = 0;
+  uint32 crc32c = 0;
+  out_->clear_crc32c();
+  if (val.dtype() != DT_STRING) {
+    status_ = WriteTensor(val, out_.get(), &data_bytes_written);
+    crc32c = out_->crc32c();
+  } else {
+    status_ = WriteStringTensor(val, out_.get(), &data_bytes_written, &crc32c);
+  }
+
+  if (status_.ok()) {
+    entry->set_size(data_bytes_written);
+    entry->set_crc32c(crc32c::Mask(crc32c));
+    size_ += data_bytes_written;
+  }
+  return status_;
+}
+
+Status BundleWriter::AddSlice(const string& full_tensor_key,
+                              const TensorShape& full_tensor_shape,
+                              const TensorSlice& slice_spec,
+                              const Tensor& slice_tensor) {
+  CHECK_NE(full_tensor_key, kHeaderEntryKey);
+  if (!status_.ok()) return status_;
+
+  // Inserts/updates the full tensor's metadata entry.
+  //
+  // In the case of a sharded save, MergeBundles() is responsible for merging
+  // the "slices" field of multiple metadata entries corresponding to the same
+  // full tensor.
+  BundleEntryProto* full_entry = &entries_[full_tensor_key];
+  if (full_entry->dtype() != DT_INVALID) {
+    CHECK_EQ(full_entry->dtype(), slice_tensor.dtype());
+  }
+  if (full_entry->has_shape()) {
+    CHECK(TensorShape(full_entry->shape()) == full_tensor_shape);
+  }
+
+  // Populates dtype, shape, and slices.  Intentionally leaving out shard_id and
+  // offset, which do not make sense for this full tensor entry.
+  full_entry->set_dtype(slice_tensor.dtype());
+  full_tensor_shape.AsProto(full_entry->mutable_shape());
+  TensorSliceProto* slice_proto = full_entry->add_slices();
+  slice_spec.AsProto(slice_proto);
+
+  // The slice itself is handled by a regular Add(), which includes adding its
+  // own metadata entry, and writing out the slice's values.
+  const string slice_name =
+      checkpoint::EncodeTensorNameSlice(full_tensor_key, slice_spec);
+  status_ = Add(slice_name, slice_tensor);
+  return status_;
+}
+
+// TODO(zongheng): on metadata write failure or !status_.ok(), consider removing
+// the orphaned data file.
+Status BundleWriter::Finish() {
+  // Flushes and closes the data file first; a close error wins over OK.
+  if (out_) {
+    status_.Update(out_->Close());
+    out_ = nullptr;
+  }
+  if (!status_.ok()) return status_;
+  // Build key -> BundleEntryProto table.
+  std::unique_ptr<WritableFile> file;
+  status_ = env_->NewWritableFile(MetaFilename(prefix_), &file);
+  if (!status_.ok()) return status_;
+  {
+    table::TableBuilder builder(table::Options(), file.get());
+    // Header entry.  A single-writer bundle always has exactly one shard.
+    BundleHeaderProto header;
+    header.set_num_shards(1);
+    header.set_endianness(BundleHeaderProto::LITTLE);
+    if (!port::kLittleEndian) header.set_endianness(BundleHeaderProto::BIG);
+    VersionDef* version = header.mutable_version();
+    version->set_producer(kTensorBundleVersion);
+    version->set_min_consumer(kTensorBundleMinConsumer);
+
+    // kHeaderEntryKey is the empty string, hence sorts before all tensor keys.
+    builder.Add(kHeaderEntryKey, header.SerializeAsString());
+
+    // All others, in sorted key order (std::map iteration order).
+    for (const auto& p : entries_) {
+      builder.Add(p.first, p.second.SerializeAsString());
+    }
+    status_ = builder.Finish();
+  }
+  status_.Update(file->Close());
+  if (!status_.ok()) return status_;
+  // Poisons status_ so further use of this writer fails — presumably Add()
+  // checks status_ first (Add() not visible here) — while still reporting
+  // success for this Finish() call.
+  status_ = errors::Internal("BundleWriter is closed");
+  return Status::OK();
+}
+
+// Merging tensor bundles.
+
+// Accumulator of metadata states during a merge.
+struct MergeState {
+  // Accumulated from the header entries.  Sum of num_shards over all bundles
+  // merged so far.
+  int num_shards = 0;
+
+  // Derives "endianness" and "version" from the first bundle merged (hence the
+  // "seen_first_bundle" guard).  The two fields must be the same for all
+  // bundles in a merge.
+  bool seen_first_bundle = false;
+  BundleHeaderProto_Endianness endianness;
+  VersionDef version;
+
+  // Tensor key -> BundleEntryProto.  std::map keeps the keys sorted, ready
+  // for writing out the final metadata table.
+  std::map<string, BundleEntryProto> entries;
+  // Data file path -> new shard id in the final merged bundle.
+  std::unordered_map<string, int32> shard_ids;
+};
+
+// Merges entries of "prefix" into the accumulator state "merge".
+// Returns OK iff the merge succeeds.
+static Status MergeOneBundle(Env* env, const string& prefix,
+                             MergeState* merge_state) {
+  VLOG(1) << "Merging bundle:" << prefix;
+  const string& filename = MetaFilename(prefix);
+  uint64 file_size;
+  TF_RETURN_IF_ERROR(env->GetFileSize(filename, &file_size));
+  std::unique_ptr<RandomAccessFile> file;
+  TF_RETURN_IF_ERROR(env->NewRandomAccessFile(filename, &file));
+
+  // Opens this bundle's metadata table.
+  table::Table* table = nullptr;
+  TF_RETURN_IF_ERROR(
+      table::Table::Open(table::Options(), file.get(), file_size, &table));
+  std::unique_ptr<table::Table> table_deleter(table);
+  std::unique_ptr<table::Iterator> iter(table->NewIterator());
+
+  int num_shards;
+  // Process header.
+  // NOTE(review): the CHECKs below crash the process on a header-less or
+  // corrupt metadata table rather than returning an error status.
+  {
+    iter->Seek(kHeaderEntryKey);
+    CHECK(iter->Valid());
+    BundleHeaderProto header;
+    TF_CHECK_OK(ParseEntryProto(iter->key(), iter->value(), &header));
+    CHECK_GE(header.num_shards(), 0);
+
+    merge_state->num_shards += header.num_shards();
+    if (!merge_state->seen_first_bundle) {
+      // First bundle: adopts its endianness and version wholesale.
+      merge_state->seen_first_bundle = true;
+      merge_state->endianness = header.endianness();
+      merge_state->version = header.version();
+    } else {
+      // Validates "endianness".
+      if (merge_state->endianness != header.endianness()) {
+        return errors::InvalidArgument(
+            "Merging bundles with conflicting endianness; inputs corrupted?");
+      }
+      // Validates "version" by comparing the serialized protos byte-for-byte.
+      string curr_version, merge_version;
+      header.version().SerializeToString(&curr_version);
+      merge_state->version.SerializeToString(&merge_version);
+      if (curr_version != merge_version) {
+        return errors::InvalidArgument(
+            "Merging bundles with different format versions: merged ",
+            merge_version, " vs. curr ", curr_version);
+      }
+    }
+    num_shards = header.num_shards();
+    iter->Next();
+  }
+
+  // Loops through the non-header to-merge entries.
+  BundleEntryProto to_merge_entry;
+  for (; iter->Valid(); iter->Next()) {
+    const string key = iter->key().ToString();
+    const auto entry_iter = merge_state->entries.find(key);
+
+    // Illegal: the duplicated entry is a non-slice tensor.
+    if (entry_iter != merge_state->entries.end() &&
+        entry_iter->second.slices().empty()) {
+      return errors::InvalidArgument("Duplicate tensor keyed by ", key,
+                                     " encountered, when merging prefix: ",
+                                     prefix);
+    }
+
+    TF_RETURN_IF_ERROR(
+        ParseEntryProto(iter->key(), iter->value(), &to_merge_entry));
+
+    // The duplicated entry holds metadata for a sliced full tensor.
+    // Allows the duplication and merges "slices".
+    if (entry_iter != merge_state->entries.end()) {
+      BundleEntryProto& existing_entry = entry_iter->second;
+      if (to_merge_entry.slices().empty()) {
+        return errors::Internal(
+            "Duplicate tensor keyed by ", key,
+            "; attempting to merge in a non-slice bundle entry");
+      }
+      // Only needs merge the "slices" field (and validate dtype/shape).
+      for (int i = 0; i < to_merge_entry.slices_size(); ++i) {
+        TensorSliceProto* slot = existing_entry.add_slices();
+        *slot = to_merge_entry.slices(i);
+      }
+      CHECK_EQ(existing_entry.dtype(), to_merge_entry.dtype());
+      CHECK(TensorShape(existing_entry.shape()) ==
+            TensorShape(to_merge_entry.shape()));
+      continue;
+    }
+
+    // Key doesn't duplicate: a fresh tensor/slice entry.
+    // insert() is a no-op when this data file has been seen before, so every
+    // entry stored in the same physical file shares one new shard id.
+    auto result = merge_state->shard_ids.insert(
+        {DataFilename(prefix, to_merge_entry.shard_id(), num_shards),
+         merge_state->shard_ids.size()});
+    to_merge_entry.set_shard_id(result.first->second);
+    merge_state->entries[key] = to_merge_entry;
+  }
+  return Status::OK();
+}
+
+Status MergeBundles(Env* env, gtl::ArraySlice<string> prefixes,
+                    const string& merged_prefix) {
+  // Merges all metadata tables.
+  // TODO(zhifengc): KeyValue sorter if it becomes too big.
+  MergeState merge;
+  env->CreateDir(io::Dirname(merged_prefix).ToString());  // Ignores errors.
+  for (int i = 0; i < prefixes.size(); ++i) {
+    TF_RETURN_IF_ERROR(MergeOneBundle(env, prefixes[i], &merge));
+  }
+
+  // Renames data files to contain the merged bundle prefix.  The new shard
+  // ids were assigned in MergeOneBundle(), one per distinct input data file.
+  for (const auto& p : merge.shard_ids) {
+    VLOG(1) << "Renaming " << p.first << " to "
+            << DataFilename(merged_prefix, p.second, merge.shard_ids.size());
+    TF_RETURN_IF_ERROR(env->RenameFile(
+        p.first,
+        DataFilename(merged_prefix, p.second, merge.shard_ids.size())));
+  }
+
+  // Writes the final metadata table under the merged prefix.
+  std::unique_ptr<WritableFile> merged_metadata;
+  TF_RETURN_IF_ERROR(
+      env->NewWritableFile(MetaFilename(merged_prefix), &merged_metadata));
+  Status status;
+  {
+    table::TableBuilder builder(table::Options(), merged_metadata.get());
+    // Header entry, carrying the accumulated shard count and the
+    // endianness/version adopted from the first input bundle.
+    BundleHeaderProto header;
+    header.set_num_shards(merge.num_shards);
+    header.set_endianness(merge.endianness);
+    *header.mutable_version() = merge.version;
+    builder.Add(kHeaderEntryKey, header.SerializeAsString());
+    // All others, in sorted key order (std::map iteration order).
+    for (const auto& p : merge.entries) {
+      builder.Add(p.first, p.second.SerializeAsString());
+    }
+    status = builder.Finish();
+  }
+  status.Update(merged_metadata->Close());
+  if (!status.ok()) return status;
+  VLOG(1) << "Merged bundles to:" << merged_prefix;
+
+  // Cleanup: best effort based and ignores errors.  Only the input metadata
+  // files remain; the input data files were renamed above.
+  for (const string& prefix : prefixes) {
+    env->DeleteFile(MetaFilename(prefix));
+  }
+  return status;
+}
+
+// Interface for reading a tensor bundle.
+
+BundleReader::BundleReader(Env* env, const string& prefix)
+    : env_(env),
+      prefix_(prefix),
+      metadata_(nullptr),
+      table_(nullptr),
+      iter_(nullptr) {
+  const string& filename = MetaFilename(prefix_);
+  uint64 file_size;
+  status_ = env_->GetFileSize(filename, &file_size);
+  if (!status_.ok()) return;
+
+  // Opens the metadata table.  Ownership of the file transfers to
+  // "metadata_", which is freed in the destructor.
+  std::unique_ptr<RandomAccessFile> wrapper;
+  status_ = env_->NewRandomAccessFile(filename, &wrapper);
+  if (!status_.ok()) return;
+  metadata_ = wrapper.release();
+  status_ = table::Table::Open(table::Options(), metadata_, file_size, &table_);
+  if (!status_.ok()) return;
+  iter_ = table_->NewIterator();
+
+  // Reads "num_shards_" from the first entry.
+  iter_->Seek(kHeaderEntryKey);
+  CHECK(iter_->Valid());
+  BundleHeaderProto header;
+  TF_CHECK_OK(ParseEntryProto(iter_->key(), iter_->value(), &header));
+  num_shards_ = header.num_shards();
+  // Cross-endian reading is not supported: the bundle's endianness must
+  // match this host's.
+  if ((header.endianness() == BundleHeaderProto::BIG && port::kLittleEndian) ||
+      (header.endianness() == BundleHeaderProto::LITTLE &&
+       !port::kLittleEndian)) {
+    status_ = errors::Unimplemented(
+        "Reading a bundle with different endianness from the reader");
+    return;
+  }
+  // Rejects bundles whose format version this reader cannot consume.
+  status_ = CheckVersions(header.version(), kTensorBundleVersion,
+                          kTensorBundleMinProducer, "Checkpoint", "checkpoint");
+}
+
+BundleReader::~BundleReader() {
+  // "metadata_", "iter_" and "table_" are owned raw pointers (see header).
+  delete metadata_;
+  delete iter_;
+  delete table_;
+  // Frees the cached per-shard input buffers and per-tensor slice sets.
+  gtl::STLDeleteValues(&data_);
+  gtl::STLDeleteValues(&tensor_slices_);
+}
+
+Status BundleReader::GetBundleEntryProto(const string& key,
+                                         BundleEntryProto* entry) {
+  // Clears "entry" up front so the caller sees an empty proto on any
+  // non-OK return below.
+  entry->Clear();
+  TF_CHECK_OK(status_);
+  Seek(key);
+  if (!iter_->Valid() || iter_->key() != key) {
+    return errors::NotFound("Key ", key, " not found in checkpoint");
+  }
+
+  // Parses into a local copy first, so "entry" stays cleared if parsing or
+  // shape validation fails.
+  BundleEntryProto entry_copy;
+  TF_RETURN_IF_ERROR(
+      ParseEntryProto(iter_->key(), iter_->value(), &entry_copy));
+  if (!TensorShape::IsValid(entry_copy.shape())) {
+    return errors::DataLoss("Invalid tensor shape: ", key, " ",
+                            entry_copy.shape().ShortDebugString());
+  }
+
+  *entry = entry_copy;
+  return Status::OK();
+}
+
+Status BundleReader::GetValue(const BundleEntryProto& entry, Tensor* val) {
+  Tensor* ret = val;
+  const TensorShape stored_shape(TensorShape(entry.shape()));
+  // If the caller passed an empty tensor, allocates scratch storage of the
+  // stored shape; "scratch" owns it so every early error return frees it
+  // (the previous revision leaked "ret" on those paths).
+  std::unique_ptr<Tensor> scratch;
+  if (val->NumElements() == 0) {
+    scratch.reset(new Tensor(entry.dtype(), stored_shape));
+    ret = scratch.get();
+  }
+
+  // Validates the "size" field.
+  if (entry.dtype() != DT_STRING) {
+    if (entry.size() != ret->TotalBytes()) {
+      return errors::DataLoss("Invalid size in bundle entry: key ", key(),
+                              "; stored size ", entry.size(),
+                              "; expected size ", ret->TotalBytes());
+    }
+  } else {
+    // Relaxes the check for string tensors as follows:
+    //   entry.size() == bytes(varint lengths) + bytes(data)
+    //                >= NumElems + bytes(data), since size bytes(varint) >= 1.
+    //   TotalBytes() == sizeof(string) * NumElems + bytes(data)
+    // Since we don't know bytes(varint lengths), we just check an inequality.
+    const size_t lower_bound = ret->NumElements() + ret->TotalBytes() -
+                               sizeof(string) * ret->NumElements();
+    if (entry.size() < lower_bound) {
+      return errors::DataLoss("Invalid size in bundle entry: key ", key(),
+                              "; stored size ", entry.size(),
+                              "; expected size is at least ", lower_bound);
+    }
+  }
+
+  // Uses the cached per-shard input buffer if one exists; otherwise opens the
+  // shard's data file for the duration of this call.  A cached buffer stays
+  // owned by "data_" (freed in the destructor); a freshly-opened file/buffer
+  // pair is owned by the locals below.  (The previous revision wrapped the
+  // cached pointer in a unique_ptr, deleting it on return while "data_" kept
+  // the dangling pointer — a use-after-free on the next call and a double
+  // free in ~BundleReader.)
+  std::unique_ptr<RandomAccessFile> file;
+  std::unique_ptr<io::InputBuffer> owned_buffered_file;
+  io::InputBuffer* buffered_file = data_[entry.shard_id()];
+  if (buffered_file == nullptr) {
+    TF_RETURN_IF_ERROR(env_->NewRandomAccessFile(
+        DataFilename(prefix_, entry.shard_id(), num_shards_), &file));
+    owned_buffered_file.reset(
+        new io::InputBuffer(file.get(), 256 << 10 /* 256KB buffer */));
+    buffered_file = owned_buffered_file.get();
+  }
+  CHECK(buffered_file != nullptr);
+
+  TF_RETURN_IF_ERROR(buffered_file->Seek(entry.offset()));
+  uint32 actual_crc32c = 0;
+  if (DataTypeCanUseMemcpy(entry.dtype())) {
+    // Important: ReadInputByChunk() bounds the readahead as min(buffer, actual
+    // bytes needed).  This is critical when reading small tensors, so we don't
+    // rely on io::InputBuffer's blind buffering here.
+    char* backing_buffer = const_cast<char*>((ret->tensor_data().data()));
+    TF_RETURN_IF_ERROR(ReadInputByChunk(buffered_file->file(), entry.offset(),
+                                        entry.size(), 8 << 20 /* 8MB buffer */,
+                                        backing_buffer));
+    actual_crc32c = crc32c::Value(backing_buffer, entry.size());
+  } else {
+    // Relies on io::InputBuffer's buffering, because we issue many neighboring
+    // reads for a single string tensor.
+    TF_RETURN_IF_ERROR(ReadStringTensor(
+        buffered_file, ret->NumElements(), entry.offset(), entry.size(),
+        GetStringBackingBuffer(*ret), &actual_crc32c));
+  }
+  // Validates the restored bytes against the stored (masked) crc32c checksum.
+  if (crc32c::Unmask(entry.crc32c()) != actual_crc32c) {
+    return errors::DataLoss(
+        "Checksum does not match: stored ",
+        strings::Printf("%08u", crc32c::Unmask(entry.crc32c())),
+        " vs. calculated on the restored bytes ", actual_crc32c);
+  }
+
+  // If scratch storage was used, hands the result to "val"; "scratch" frees
+  // the temporary Tensor object on return.
+  *val = *ret;
+  return Status::OK();
+}
+
+Status BundleReader::Lookup(const string& key, Tensor* val) {
+  // Fetches the metadata entry for "key"; NotFound if it is absent.
+  BundleEntryProto entry;
+  TF_RETURN_IF_ERROR(GetBundleEntryProto(key, &entry));
+
+  if (!entry.slices().empty()) {
+    // Partitioned tensor: reassemble the full extent from its stored slices.
+    const TensorSlice full_slice(TensorShape(entry.shape()).dims());
+    return GetSliceValue(key, entry, full_slice, val);
+  }
+  // Unpartitioned tensor: read the value directly.
+  return GetValue(entry, val);
+}
+
+Status BundleReader::LookupSlice(const string& full_tensor_key,
+                                 const TensorSlice& slice_spec, Tensor* val) {
+  // Looks up the full tensor's metadata entry, then assembles the requested
+  // slice from whichever stored slices cover it.
+  BundleEntryProto entry;
+  TF_RETURN_IF_ERROR(GetBundleEntryProto(full_tensor_key, &entry));
+  return GetSliceValue(full_tensor_key, entry, slice_spec, val);
+}
+
+Status BundleReader::GetSliceValue(const string& full_tensor_key,
+                                   const BundleEntryProto& full_tensor_entry,
+                                   const TensorSlice& slice_spec, Tensor* val) {
+  using checkpoint::TensorSliceSet;
+  using checkpoint::RegisterTensorSlice;
+  // NOTE(review): this DCHECK is vacuously true (a repeated-field size is
+  // never negative); the zero-slices case is explicitly handled below.
+  DCHECK_GE(full_tensor_entry.slices_size(), 0);
+
+  const TensorShape full_shape(TensorShape(full_tensor_entry.shape()));
+  std::vector<std::pair<TensorSlice, string>> details;
+  const TensorSliceSet* tss =
+      gtl::FindPtrOrNull(tensor_slices_, full_tensor_key);
+
+  // Populates the "full tensor key -> TensorSliceSet" cache.
+  if (tss == nullptr) {
+    if (full_tensor_entry.slices().empty()) {
+      // Special case: a writer has saved a tensor fully, but the reader wants
+      // to read in slices.  We therefore register the full slice on-demand here
+      // without further complicating the on-disk bundle format.
+      RegisterTensorSlice(
+          full_tensor_key, full_shape, full_tensor_entry.dtype(), /* tag */ "",
+          /* full slice */ TensorSlice(full_shape.dims()), &tensor_slices_);
+    }
+    for (const TensorSliceProto& slice : full_tensor_entry.slices()) {
+      RegisterTensorSlice(full_tensor_key, full_shape,
+                          full_tensor_entry.dtype(),
+                          /* tag */ "", TensorSlice(slice), &tensor_slices_);
+    }
+    tss = gtl::FindPtrOrNull(tensor_slices_, full_tensor_key);
+    CHECK_NE(tss, nullptr);
+  }
+  // QueryMeta() fails iff the stored slices do not fully cover "slice_spec".
+  if (!tss->QueryMeta(slice_spec, &details)) {
+    return errors::InvalidArgument(
+        "Does not have sufficient slices for partitioned tensor ",
+        full_tensor_key, " to restore in slice_spec: ",
+        slice_spec.DebugString());
+  }
+
+  // The union of the slices in "details" covers "slice_spec".  Performs the
+  // copies from each.
+  BundleEntryProto stored_slice_entry = full_tensor_entry;
+  for (const auto& slice_tag_pair : details) {
+    // Seeks for the stored slice.
+    const TensorSlice& stored_slice = slice_tag_pair.first;
+
+    // We already have the entry for the full tensor, so don't query again if
+    // the slice is full.
+    if (!stored_slice.IsFull()) {
+      const string& encoded_stored_slice_name =
+          checkpoint::EncodeTensorNameSlice(full_tensor_key, stored_slice);
+      status_ =
+          GetBundleEntryProto(encoded_stored_slice_name, &stored_slice_entry);
+      if (!status_.ok()) return status_;
+    }
+
+    // TODO(zongheng): should we take an OpKernelContext, so that we can call
+    // allocate_temp()?  Note that without major refactorings to Saver, it's
+    // hard for the caller of the tensor bundle module to allocate these
+    // precisely-shaped scratch storage.
+    // TODO(zongheng): implement an important optimization: if the stored slice
+    // is a subset of the to-restore slice, directly read the stored slice into
+    // the latter's already-allocated backing buffer.
+
+    // Optimization for the common case: stored slice == to-restore slice.
+    if (stored_slice == slice_spec) {
+      VLOG(1) << "Optimized for common case: directly copying into "
+                 "pre-allocated buffer; spec: "
+              << slice_spec.DebugString();
+      status_ = GetValue(stored_slice_entry, val);
+      return status_;
+    }
+
+    // General case: reads the stored slice into scratch storage, then copies
+    // its intersection with "slice_spec" into "val".
+    Tensor stored_slice_tensor(stored_slice_entry.dtype(),
+                               TensorShape(stored_slice_entry.shape()));
+    status_ = GetValue(stored_slice_entry, &stored_slice_tensor);
+    if (!status_.ok()) return status_;
+
+    // Copies the intersection over.
+    const DataType common_dtype = full_tensor_entry.dtype();
+    switch (common_dtype) {
+#define HANDLE_COPY(T)                                                 \
+  case DataTypeToEnum<T>::value:                                       \
+    CHECK(CopyDataFromTensorSliceToTensorSlice(                        \
+        full_shape, stored_slice, slice_spec,                          \
+        stored_slice_tensor.flat<T>().data(), val->flat<T>().data())); \
+    break;
+
+      HANDLE_COPY(float)
+      HANDLE_COPY(double)
+      HANDLE_COPY(int32)
+      HANDLE_COPY(uint8)
+      HANDLE_COPY(int16)
+      HANDLE_COPY(int8)
+      HANDLE_COPY(complex64)
+      HANDLE_COPY(complex128)
+      HANDLE_COPY(int64)
+      HANDLE_COPY(bool)
+      HANDLE_COPY(qint32)
+      HANDLE_COPY(quint8)
+      HANDLE_COPY(qint8)
+      // Note: DT_STRING is not in the list above, so a sliced string tensor
+      // that requires a partial copy falls into "default" and errors out.
+      default:
+        return errors::InvalidArgument("Dtype ", DataTypeString(common_dtype),
+                                       " not supported.");
+    }
+#undef HANDLE_COPY
+  }
+  return Status::OK();
+}
+
+Status BundleReader::LookupTensorShape(const string& key, TensorShape* shape) {
+  // Clears "shape" first so the caller sees an empty shape when "key" is not
+  // found — the contract documented in the header ("Clears 'shape' if not
+  // found"), which the previous revision did not implement.
+  *shape = TensorShape();
+  BundleEntryProto entry;
+  TF_RETURN_IF_ERROR(GetBundleEntryProto(key, &entry));
+
+  *shape = TensorShape(entry.shape());
+  return Status::OK();
+}
+
+// Frees the owned WritableFile (see "file_" in the header).
+FileOutputBuffer::~FileOutputBuffer() { delete file_; }
+
+Status FileOutputBuffer::Append(StringPiece data) {
+  // In the below, it is critical to calculate the checksum on the actually
+  // copied bytes, not the source bytes.  This is because "data" typically
+  // points to tensor buffers, which may be concurrently written.
+  if (data.size() + position_ <= buffer_size_) {
+    // Can fit into the current buffer.
+    memcpy(&buffer_[position_], data.data(), data.size());
+    crc32c_ = crc32c::Extend(crc32c_, &buffer_[position_], data.size());
+  } else if (data.size() <= buffer_size_) {
+    // Cannot fit, but can fit after flushing.
+    TF_RETURN_IF_ERROR(Flush());
+    memcpy(&buffer_[0], data.data(), data.size());
+    crc32c_ = crc32c::Extend(crc32c_, &buffer_[0], data.size());
+  } else {
+    // Cannot fit even after flushing.  So we break down "data" by chunk, and
+    // flush/checksum each chunk.
+    TF_RETURN_IF_ERROR(Flush());
+    for (size_t i = 0; i < data.size(); i += buffer_size_) {
+      const size_t nbytes = std::min(data.size() - i, buffer_size_);
+      memcpy(&buffer_[0], data.data() + i, nbytes);
+      crc32c_ = crc32c::Extend(crc32c_, &buffer_[0], nbytes);
+      position_ = nbytes;
+      TF_RETURN_IF_ERROR(Flush());
+    }
+    // Every chunk was flushed, so the buffer is empty; return without the
+    // "position_" update below.
+    return Status::OK();
+  }
+  // The first two branches leave the copied bytes buffered.
+  position_ += data.size();
+  return Status::OK();
+}
+
+Status FileOutputBuffer::Close() {
+  // Writes out any buffered bytes before closing the underlying file.
+  TF_RETURN_IF_ERROR(Flush());
+  return file_->Close();
+}
+
+Status FileOutputBuffer::Flush() {
+  // Appends buffer_[0, position_) to the file, then flushes the file itself.
+  if (position_ > 0) {
+    TF_RETURN_IF_ERROR(file_->Append(StringPiece(&buffer_[0], position_)));
+    position_ = 0;
+  }
+  return file_->Flush();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.h b/tensorflow/core/util/tensor_bundle/tensor_bundle.h
new file mode 100644
index 00000000000..ea71041786b
--- /dev/null
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.h
@@ -0,0 +1,313 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// A tensor bundle is a set of immutable persistent files storing a set of named
+// tensors.  It is designed for checkpointing TensorFlow tensors.
+//
+// The paths of the managed files share a common prefix; e.g., with the prefix:
+//   /fs/model/train/ckpt-step/ckpt
+//
+// the bundle may contain a metadata file, and sharded data files:
+//   /fs/model/train/ckpt-step/
+//       ckpt.index
+//       ckpt.data-00000-of-00020
+//       ckpt.data-00001-of-00020
+//       ...
+//       ckpt.data-00019-of-00020
+//
+// The ".index" file is a string-string immutable table
+// (tensorflow::table::Table).  Each key is a name of a tensor and its value is
+// a serialized BundleEntryProto.  Each BundleEntryProto describes the metadata
+// of a tensor: which of the "data" files contains the content of a tensor, the
+// offset into that file, checksum, some auxilary data, etc.
+//
+// A tensor bundle can be accessed randomly using a BundleReader.  Usage:
+//
+//   BundleReader reader(env, "/fs/model/train/ckpt-step/ckpt");
+//   reader.Lookup("name", &tensor);
+//
+// A tensor bundle can be built using BundleWriter.  Each BundleWriter builds a
+// single data file bundle.  Multiple bundles can then be merged by
+// MergeBundles() without reading and writing large chunk of data: it reads the
+// metadata files and outputs a single merged metadata.  Typical usage:
+//
+//   worker 0:
+//     BundleWriter writer(env, "/fs/model/train/ckpt-step/tmp/worker0-step");
+//     writer.Add(...);  // Adds the tensors on this worker.
+//     writer.Finish();  // Flushes.
+//   worker 1:
+//     BundleWriter writer(env, "/fs/model/train/ckpt-step/tmp/worker1-step");
+//     writer.Add(...);
+//     writer.Finish();
+//   worker 2:
+//     MergeBundles(env,
+//       {"/fs/model/train/ckpt-step/tmp/worker0-step",
+//        "/fs/model/train/ckpt-step/tmp/worker1-step"},
+//       "/fs/model/train/ckpt-step/ckpt" /* merged prefix */);
+//
+
+#ifndef TENSORFLOW_UTIL_TENSOR_BUNDLE_TENSOR_BUNDLE_H_
+#define TENSORFLOW_UTIL_TENSOR_BUNDLE_TENSOR_BUNDLE_H_
+
+#include "tensorflow/core/protobuf/tensor_bundle.pb.h"
+
+#include <map>
+#include <string>
+#include <unordered_map>
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_slice.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/io/inputbuffer.h"
+#include "tensorflow/core/lib/io/table.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/file_system.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/tensor_slice_set.h"
+
+namespace tensorflow {
+
+class FileOutputBuffer;
+
+// Versioning of the tensor bundle format.
+// Follows the same rules as 3p/tf/core/public/version.h.
+//
+// History:
+// 0. Any tensor bundles produced before this field was added.
+// 1. Added this field (2016-09-14).
+extern const int kTensorBundleMinProducer;
+extern const int kTensorBundleMinConsumer;
+extern const int kTensorBundleVersion;
+
+// The empty string, hence always the first key in the metadata table.  Its
+// corresponding value is a BundleHeaderProto.
+extern const char* const kHeaderEntryKey;
+
+// Builds a string-string table of tensor names to BundleEntryProto (metadata).
+// All threads accessing the same BundleWriter must synchronize.
+class BundleWriter {
+ public:
+  BundleWriter(Env* env, const string& prefix);
+  ~BundleWriter();
+
+  // Adds the tensor "val" under key "key".
+  // Across calls "key" must be unique but can be added in any order.
+  Status Add(const string& key, const Tensor& val);
+
+  // Partitioned variables support.
+  // A slice of a full tensor is stored in two entries in the metadata table:
+  //
+  //   full_tensor_key   -> BundleEntryProto, describing all stored slices
+  //                        of this full tensor.  Does not append to the data
+  //                        file.
+  //   encoded slice key -> BundleEntryProto, describing one particular slice.
+  //                        Appends values of this slice to the data file.
+  //
+  // Slices of a full tensor can be added in any order.
+  //
+  // If a full tensor has slices placed on N devices and N BundleWriter's are
+  // concurrently used, the caller must use MergeBundles() to ensure that a
+  // consistent entry for "full_tensor_key" is produced.
+  //
+  // Returns an error if the same slice is added the second time.
+  Status AddSlice(const string& full_tensor_key,
+                  const TensorShape& full_tensor_shape,
+                  const TensorSlice& slice_spec, const Tensor& slice_tensor);
+
+  // Finishes the writer and flushes.  After Finish() returns (even with OK),
+  // the writer is closed and must not be reused.
+  Status Finish() TF_MUST_USE_RESULT;
+
+  // Returns the first error encountered by this writer, or OK.
+  Status status() const { return status_; }
+
+ private:
+  Env* const env_;  // Not owned.
+  const string prefix_;
+  // Buffered writer for the single data file this writer produces.
+  std::unique_ptr<FileOutputBuffer> out_;
+  int64 size_;  // Number of bytes written into out_.
+  // Tensor key -> metadata; sorted, ready for the metadata table.
+  std::map<string, BundleEntryProto> entries_;
+  Status status_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(BundleWriter);
+};
+
+// Merges a set of bundles (given their prefixes) into a single bundle with the
+// given "merged_prefix".  The merged metadata is guaranteed to be consistent.
+//
+// If there are N bundles in "prefixes", during the merge the data files will be
+// renamed to contain a proper sharded file spec, with num_shards set to the sum
+// of num_shards across the N input bundles.
+//
+// The caller should only rely on the metadata file of the merged bundle to
+// query information about a tensor.  In particular, this function does not
+// guarantee not to re-order the input data files.
+//
+// Once merged, makes a best effort to delete the old metadata files.
+// Returns OK iff all bundles are successfully merged.
+Status MergeBundles(Env* env, gtl::ArraySlice<string> prefixes,
+                    const string& merged_prefix);
+
+// On construction, silently attempts to read the metadata associated with
+// "prefix".  If caller intends to call any function afterwards, "status()"
+// must be checked.
+// All threads accessing the same BundleReader must synchronize.
+class BundleReader {
+ public:
+  BundleReader(Env* const env, const string& prefix);
+  ~BundleReader();
+
+  // Is ok() iff the reader construction is successful (completed the read of
+  // the metadata).
+  Status status() const { return status_; }
+
+  // Looks up the shape of the tensor keyed by "key".
+  // Clears "shape" if not found.
+  // REQUIRES: status().ok()
+  Status LookupTensorShape(const string& key,
+                           TensorShape* shape) TF_MUST_USE_RESULT;
+
+  // Looks up the tensor keyed by "key".  If "key" refers to a partitioned
+  // tensor, attempts to look up the full contents using all stored slices.
+  //
+  // Out-tensor "val" can be either empty or initialized with a non-empty shape:
+  //
+  // * If empty, this function allocates an exactly-sized Tensor to hold the
+  //   contents found in this bundle.
+  //
+  // * If non-empty, caller is responsible for making sure "val" has the same
+  //   shape as the corresponding contents. This function directly uses the
+  //   buffer without extra allocation.
+  //
+  // On error, "val" may contain nonsense data.  Returns a NotFound error if
+  // tensor keyed by "key" does not exist in this bundle.
+  //
+  // Validates the stored crc32c checksum against the restored bytes.
+  // REQUIRES: status().ok()
+  Status Lookup(const string& key, Tensor* val) TF_MUST_USE_RESULT;
+
+  // Looks up a specific slice of a partitioned tensor.
+  // It is only required that the stored slices cover the requested slice,
+  // namely "slice_spec" is a subset of the union of the stored slices.
+  // REQUIRES: status().ok()
+  Status LookupSlice(const string& full_tensor_key,
+                     const TensorSlice& slice_spec,
+                     Tensor* val) TF_MUST_USE_RESULT;
+
+  // Seeks to the first position in the bundle whose key is no less than "key".
+  // REQUIRES: status().ok()
+  void Seek(const string& key) { return iter_->Seek(key); }
+  // Moves to the next position in the bundle.
+  // REQUIRES: status().ok()
+  void Next() const { iter_->Next(); }
+  // Returns true iff the reader is positioned to a key/val pair.
+  // REQUIRES: status().ok()
+  bool Valid() const { return iter_->Valid(); }
+
+  // Returns the key at the current position.
+  // REQUIRES: status().ok() && Valid()
+  StringPiece key() const { return iter_->key(); }
+  // Returns the raw value at the current position.
+  // REQUIRES: status().ok() && Valid()
+  StringPiece value() const { return iter_->value(); }
+
+ private:
+  // Seeks for "key" and reads the metadata proto.
+  // On non-OK return, clears "entry" for the caller.
+  // REQUIRES: status().ok()
+  Status GetBundleEntryProto(const string& key,
+                             BundleEntryProto* entry) TF_MUST_USE_RESULT;
+
+  // Reads the tensor value described by the metadata proto "entry".
+  // Usage for "val" follows the comment of "Lookup()".
+  Status GetValue(const BundleEntryProto& entry,
+                  Tensor* val) TF_MUST_USE_RESULT;
+
+  // Reads the slice described by "slice_spec".  The corresponding full tensor
+  // has key "full_tensor_key" and metadata proto "full_tensor_entry".
+  // "full_tensor_entry" may legally contain zero slices, in which case the
+  // tensor was stored whole and the full slice is registered on demand.
+  Status GetSliceValue(const string& full_tensor_key,
+                       const BundleEntryProto& full_tensor_entry,
+                       const TensorSlice& slice_spec,
+                       Tensor* val) TF_MUST_USE_RESULT;
+
+  Env* env_;  // Not owned.
+  const string prefix_;
+
+  Status status_;
+  RandomAccessFile* metadata_;  // Owned.
+  table::Table* table_;
+  table::Iterator* iter_;
+  // Cached per-shard input buffers: shard id -> InputBuffer (owned).
+  std::unordered_map<int32, io::InputBuffer*> data_;
+
+  // Maps each partitioned tensor's key to its stored slices (represented in a
+  // TensorSliceSet).  Populated on-demand.
+  std::unordered_map<string, checkpoint::TensorSliceSet*> tensor_slices_;
+
+  // Expected number of data file shards in the bundle.  Extracted by reading
+  // the header entry in the metadata table.
+  int num_shards_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(BundleReader);
+};
+
+// A buffering wrapper for a WritableFile.  Useful if the caller wishes to issue
+// small writes to a file (e.g. writing out a list of small varints).
+// External synchronization must be used in the presence of concurrent callers.
+class FileOutputBuffer {
+ public:
+  // Takes ownership of "file" (deleted in the destructor).
+  FileOutputBuffer(WritableFile* file, size_t buffer_size)
+      : file_(file), position_(0), buffer_size_(buffer_size) {
+    DCHECK_GT(buffer_size, 0);
+    buffer_.resize(buffer_size);
+  }
+  ~FileOutputBuffer();
+
+  // Buffered append.
+  Status Append(StringPiece data);
+
+  // Returns the running crc32c checksum of all currently appended bytes.
+  uint32 crc32c() { return crc32c_; }
+  // Clears the running crc32c checksum.
+  void clear_crc32c() { crc32c_ = 0; }
+
+  // Appends the buffered data, then closes the underlying file.
+  Status Close();
+
+ private:
+  // Appends the buffered data and flushes.
+  Status Flush();
+
+  WritableFile* file_;  // Owned.
+
+  // buffer_[0, position_) holds the buffered data not yet appended to the
+  // underlying file.
+  size_t position_;
+  const size_t buffer_size_;
+  std::vector<char> buffer_;
+
+  // Checksum of all appended bytes since construction or last clear_crc32c().
+  uint32 crc32c_ = 0;
+};
+
+// Pattern: "<prefix>.data-<padded shard_id>-of-<padded num_shards>".
+string DataFilename(const string& prefix, int32 shard_id, int32 num_shards);
+// Pattern: "<prefix>.index".
+string MetaFilename(const string& prefix);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_UTIL_TENSOR_BUNDLE_TENSOR_BUNDLE_H_
diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc
new file mode 100644
index 00000000000..c48205866b4
--- /dev/null
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc
@@ -0,0 +1,585 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h"
+
+#include <random>
+#include <vector>
+
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/framework/versions.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/io/table_builder.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+namespace {
+
+string Prefix(const string& prefix) {
+  return strings::StrCat(testing::TmpDir(), "/", prefix);
+}
+
+template <typename T>
+Tensor Constant(T v, TensorShape shape) {
+  Tensor ret(DataTypeToEnum<T>::value, shape);
+  ret.flat<T>().setConstant(v);
+  return ret;
+}
+
+template <typename T>
+Tensor Constant_2x3(T v) {
+  return Constant(v, TensorShape({2, 3}));
+}
+
+template <typename T>
+void Expect(BundleReader* reader, const string& key,
+            const Tensor& expected_val) {
+  // Tests for LookupTensorShape().
+  TensorShape shape;
+  TF_ASSERT_OK(reader->LookupTensorShape(key, &shape));
+  EXPECT_EQ(expected_val.shape(), shape);
+  // Tests for Lookup(), checking tensor contents.
+  Tensor val(expected_val.dtype(), shape);
+  TF_ASSERT_OK(reader->Lookup(key, &val));
+  test::ExpectTensorEqual<T>(val, expected_val);
+}
+
+std::vector<string> AllTensorKeys(BundleReader* reader) {
+  std::vector<string> ret;
+  reader->Seek(kHeaderEntryKey);
+  reader->Next();
+  for (; reader->Valid(); reader->Next()) {
+    ret.push_back(reader->key().ToString());
+  }
+  return ret;
+}
+
+// Writes out the metadata file of a bundle again, with the endianness marker
+// bit flipped.
+Status FlipEndiannessBit(const string& prefix) {
+  Env* env = Env::Default();
+  const string metadata_tmp_path = Prefix("some_tmp_path");
+  std::unique_ptr<WritableFile> file;
+  TF_RETURN_IF_ERROR(env->NewWritableFile(metadata_tmp_path, &file));
+  table::TableBuilder builder(table::Options(), file.get());
+
+  // Reads the existing metadata file, and fills the builder.
+  {
+    const string filename = MetaFilename(prefix);
+    uint64 file_size;
+    TF_RETURN_IF_ERROR(env->GetFileSize(filename, &file_size));
+    std::unique_ptr<RandomAccessFile> file;
+    TF_RETURN_IF_ERROR(env->NewRandomAccessFile(filename, &file));
+
+    table::Table* table = nullptr;
+    TF_RETURN_IF_ERROR(
+        table::Table::Open(table::Options(), file.get(), file_size, &table));
+    std::unique_ptr<table::Table> table_deleter(table);
+    std::unique_ptr<table::Iterator> iter(table->NewIterator());
+
+    // Reads the header entry.
+    iter->Seek(kHeaderEntryKey);
+    CHECK(iter->Valid());
+    BundleHeaderProto header;
+    CHECK(header.ParseFromArray(iter->value().data(), iter->value().size()));
+    // Flips the endianness.
+    if (header.endianness() == BundleHeaderProto::LITTLE) {
+      header.set_endianness(BundleHeaderProto::BIG);
+    } else {
+      header.set_endianness(BundleHeaderProto::LITTLE);
+    }
+    builder.Add(iter->key(), header.SerializeAsString());
+    iter->Next();
+
+    // Adds the non-header entries unmodified.
+    for (; iter->Valid(); iter->Next()) builder.Add(iter->key(), iter->value());
+  }
+  TF_RETURN_IF_ERROR(builder.Finish());
+  TF_RETURN_IF_ERROR(env->RenameFile(metadata_tmp_path, MetaFilename(prefix)));
+  return file->Close();
+}
+
+template <typename T>
+void TestBasic() {
+  {
+    BundleWriter writer(Env::Default(), Prefix("foo"));
+    writer.Add("foo_003", Constant_2x3<T>(3));
+    writer.Add("foo_000", Constant_2x3<T>(0));
+    writer.Add("foo_002", Constant_2x3<T>(2));
+    writer.Add("foo_001", Constant_2x3<T>(1));
+    TF_ASSERT_OK(writer.Finish());
+  }
+  {
+    BundleReader reader(Env::Default(), Prefix("foo"));
+    TF_ASSERT_OK(reader.status());
+    EXPECT_EQ(
+        AllTensorKeys(&reader),
+        std::vector<string>({"foo_000", "foo_001", "foo_002", "foo_003"}));
+    Expect<T>(&reader, "foo_000", Constant_2x3<T>(0));
+    Expect<T>(&reader, "foo_001", Constant_2x3<T>(1));
+    Expect<T>(&reader, "foo_002", Constant_2x3<T>(2));
+    Expect<T>(&reader, "foo_003", Constant_2x3<T>(3));
+  }
+  {
+    BundleWriter writer(Env::Default(), Prefix("bar"));
+    writer.Add("bar_003", Constant_2x3<T>(3));
+    writer.Add("bar_000", Constant_2x3<T>(0));
+    writer.Add("bar_002", Constant_2x3<T>(2));
+    writer.Add("bar_001", Constant_2x3<T>(1));
+    TF_ASSERT_OK(writer.Finish());
+  }
+  {
+    BundleReader reader(Env::Default(), Prefix("bar"));
+    TF_ASSERT_OK(reader.status());
+    EXPECT_EQ(
+        AllTensorKeys(&reader),
+        std::vector<string>({"bar_000", "bar_001", "bar_002", "bar_003"}));
+    Expect<T>(&reader, "bar_003", Constant_2x3<T>(3));
+    Expect<T>(&reader, "bar_002", Constant_2x3<T>(2));
+    Expect<T>(&reader, "bar_001", Constant_2x3<T>(1));
+    Expect<T>(&reader, "bar_000", Constant_2x3<T>(0));
+  }
+  TF_ASSERT_OK(MergeBundles(Env::Default(), {Prefix("foo"), Prefix("bar")},
+                            Prefix("merged")));
+  {
+    BundleReader reader(Env::Default(), Prefix("merged"));
+    TF_ASSERT_OK(reader.status());
+    EXPECT_EQ(
+        AllTensorKeys(&reader),
+        std::vector<string>({"bar_000", "bar_001", "bar_002", "bar_003",
+                             "foo_000", "foo_001", "foo_002", "foo_003"}));
+    Expect<T>(&reader, "bar_000", Constant_2x3<T>(0));
+    Expect<T>(&reader, "bar_001", Constant_2x3<T>(1));
+    Expect<T>(&reader, "bar_002", Constant_2x3<T>(2));
+    Expect<T>(&reader, "bar_003", Constant_2x3<T>(3));
+    Expect<T>(&reader, "foo_000", Constant_2x3<T>(0));
+    Expect<T>(&reader, "foo_001", Constant_2x3<T>(1));
+    Expect<T>(&reader, "foo_002", Constant_2x3<T>(2));
+    Expect<T>(&reader, "foo_003", Constant_2x3<T>(3));
+  }
+}
+
+template <typename T>
+void TestNonStandardShapes() {
+  {
+    BundleWriter writer(Env::Default(), Prefix("nonstandard"));
+    writer.Add("scalar", Constant<T>(0, TensorShape()));
+    writer.Add("non_standard0", Constant<T>(0, TensorShape({0, 1618})));
+    writer.Add("non_standard1", Constant<T>(0, TensorShape({16, 0, 18})));
+    TF_ASSERT_OK(writer.Finish());
+  }
+  {
+    BundleReader reader(Env::Default(), Prefix("nonstandard"));
+    TF_ASSERT_OK(reader.status());
+    Expect<T>(&reader, "scalar", Constant<T>(0, TensorShape()));
+    Expect<T>(&reader, "non_standard0", Constant<T>(0, TensorShape({0, 1618})));
+    Expect<T>(&reader, "non_standard1",
+              Constant<T>(0, TensorShape({16, 0, 18})));
+  }
+}
+
+// Writes a bundle to disk with a bad "version"; checks for "expected_error".
+void VersionTest(const VersionDef& version, StringPiece expected_error) {
+  const string path = Prefix("version_test");
+  {
+    // Prepare an empty bundle with the given version information.
+    BundleHeaderProto header;
+    *header.mutable_version() = version;
+
+    // Write the metadata file to disk.
+    std::unique_ptr<WritableFile> file;
+    TF_ASSERT_OK(Env::Default()->NewWritableFile(MetaFilename(path), &file));
+    table::TableBuilder builder(table::Options(), file.get());
+    builder.Add(kHeaderEntryKey, header.SerializeAsString());
+    TF_ASSERT_OK(builder.Finish());
+  }
+  // Read it back in and verify that we get the expected error.
+  BundleReader reader(Env::Default(), path);
+  EXPECT_TRUE(errors::IsInvalidArgument(reader.status()));
+  EXPECT_TRUE(
+      StringPiece(reader.status().error_message()).starts_with(expected_error));
+}
+
+}  // namespace
+
+TEST(TensorBundleTest, Basic) {
+  TestBasic<float>();
+  TestBasic<double>();
+  TestBasic<int32>();
+  TestBasic<uint8>();
+  TestBasic<int16>();
+  TestBasic<int8>();
+  TestBasic<complex64>();
+  TestBasic<complex128>();
+  TestBasic<int64>();
+  TestBasic<bool>();
+  TestBasic<qint32>();
+  TestBasic<quint8>();
+  TestBasic<qint8>();
+}
+
+TEST(TensorBundleTest, PartitionedVariables) {
+  const TensorShape kFullShape({5, 10});
+  // Adds two slices.
+  // First slice: column 0, all zeros.
+  // Second slice: column 1 to rest, all ones.
+  {
+    BundleWriter writer(Env::Default(), Prefix("foo"));
+    TensorSlice slice = TensorSlice::ParseOrDie("-:0,1");
+
+    TF_ASSERT_OK(writer.AddSlice("foo", kFullShape,
+                                 TensorSlice::ParseOrDie("-:0,1"),
+                                 Constant<float>(0., TensorShape({5, 1}))));
+    TF_ASSERT_OK(writer.AddSlice("foo", kFullShape,
+                                 TensorSlice::ParseOrDie("-:1,9"),
+                                 Constant<float>(1., TensorShape({5, 9}))));
+    TF_ASSERT_OK(writer.Finish());
+  }
+  // Reads in full.
+  {
+    BundleReader reader(Env::Default(), Prefix("foo"));
+    TF_ASSERT_OK(reader.status());
+
+    Tensor expected_val(DT_FLOAT, kFullShape);
+    test::FillFn<float>(&expected_val, [](int offset) -> float {
+      if (offset % 10 == 0) {
+        return 0;  // First column zeros.
+      }
+      return 1;  // Other columns ones.
+    });
+
+    Tensor val(DT_FLOAT, kFullShape);
+    TF_ASSERT_OK(reader.Lookup("foo", &val));
+    test::ExpectTensorEqual<float>(val, expected_val);
+  }
+  // Reads a slice consisting of first two columns, "cutting" both slices.
+  {
+    BundleReader reader(Env::Default(), Prefix("foo"));
+    TF_ASSERT_OK(reader.status());
+
+    // First two columns, "cutting" both slices.
+    const TensorSlice distinct_slice = TensorSlice::ParseOrDie("-:0,2");
+    Tensor expected_val(DT_FLOAT, TensorShape({5, 2}));
+    test::FillFn<float>(&expected_val, [](int offset) -> float {
+      if (offset % 2 == 0) {
+        return 0;  // First column zeros.
+      }
+      return 1;  // Other columns ones.
+    });
+
+    Tensor val(DT_FLOAT, TensorShape({5, 2}));
+    TF_ASSERT_OK(reader.LookupSlice("foo", distinct_slice, &val));
+    test::ExpectTensorEqual<float>(val, expected_val);
+  }
+  // Reads a slice consisting of columns 2-4, "cutting" the second slice only.
+  {
+    BundleReader reader(Env::Default(), Prefix("foo"));
+    TF_ASSERT_OK(reader.status());
+
+    const TensorSlice distinct_slice = TensorSlice::ParseOrDie("-:2,2");
+    Tensor val(DT_FLOAT, TensorShape({5, 2}));
+    TF_ASSERT_OK(reader.LookupSlice("foo", distinct_slice, &val));
+    test::ExpectTensorEqual<float>(val,
+                                   Constant<float>(1., TensorShape({5, 2})));
+  }
+}
+
+TEST(TensorBundleTest, NonStandardShapes) {
+  TestNonStandardShapes<float>();
+  TestNonStandardShapes<double>();
+  TestNonStandardShapes<int32>();
+  TestNonStandardShapes<uint8>();
+  TestNonStandardShapes<int16>();
+  TestNonStandardShapes<int8>();
+  TestNonStandardShapes<complex64>();
+  TestNonStandardShapes<complex128>();
+  TestNonStandardShapes<int64>();
+  TestNonStandardShapes<bool>();
+  TestNonStandardShapes<qint32>();
+  TestNonStandardShapes<quint8>();
+  TestNonStandardShapes<qint8>();
+}
+
+TEST(TensorBundleTest, StringTensors) {
+  {
+    BundleWriter writer(Env::Default(), Prefix("foo"));
+    writer.Add("string_tensor", Tensor(DT_STRING, TensorShape({1})));  // Empty.
+    writer.Add("scalar", test::AsTensor<string>({"hello"}));
+    writer.Add("strs", test::AsTensor<string>(
+                           {"hello", "", "x01", string(1 << 25, 'c')}));
+    // Mixes in some floats.
+    writer.Add("floats", Constant_2x3<float>(16.18));
+    TF_ASSERT_OK(writer.Finish());
+  }
+  {
+    BundleReader reader(Env::Default(), Prefix("foo"));
+    TF_ASSERT_OK(reader.status());
+    EXPECT_EQ(
+        AllTensorKeys(&reader),
+        std::vector<string>({"floats", "scalar", "string_tensor", "strs"}));
+
+    Expect<string>(&reader, "string_tensor",
+                   Tensor(DT_STRING, TensorShape({1})));
+    Expect<string>(&reader, "scalar", test::AsTensor<string>({"hello"}));
+    Expect<string>(
+        &reader, "strs",
+        test::AsTensor<string>({"hello", "", "x01", string(1 << 25, 'c')}));
+    Expect<float>(&reader, "floats", Constant_2x3<float>(16.18));
+  }
+}
+
+TEST(TensorBundleTest, DirectoryStructure) {
+  Env* env = Env::Default();
+  // Writes two bundles.
+  const std::vector<string> kBundlePrefixes = {Prefix("worker0"),
+                                               Prefix("worker1")};
+  for (int i = 0; i < 2; ++i) {
+    BundleWriter writer(env, kBundlePrefixes[i]);
+    writer.Add(strings::StrCat("tensor", i), Constant_2x3<float>(0.));
+    TF_ASSERT_OK(writer.Finish());
+  }
+
+  // Ensures we have the expected files.
+  auto CheckDirFiles = [env](const string& bundle_prefix,
+                             gtl::ArraySlice<string> expected_files) {
+    StringPiece dir = io::Dirname(bundle_prefix);
+    for (const string& expected_file : expected_files) {
+      EXPECT_TRUE(env->FileExists(io::JoinPath(dir, expected_file)));
+    }
+  };
+
+  // Check we have:
+  //   worker<i>.index
+  //   worker<i>.data-00000-of-00001
+  CheckDirFiles(kBundlePrefixes[0],
+                {"worker0.index", "worker0.data-00000-of-00001"});
+  CheckDirFiles(kBundlePrefixes[1],
+                {"worker1.index", "worker1.data-00000-of-00001"});
+
+  // Trivially "merge" one bundle to some other location (i.e., a renaming).
+  const string kAnotherPrefix = Prefix("another");
+  TF_ASSERT_OK(MergeBundles(env, {kBundlePrefixes[0]}, kAnotherPrefix));
+  CheckDirFiles(kAnotherPrefix,
+                {"another.index", "another.data-00000-of-00001"});
+
+  // Performs actual merge of the two bundles.  Check we have:
+  //   merged.index
+  //   merged.data-00000-of-00002
+  //   merged.data-00001-of-00002
+  const string kMerged = Prefix("merged");
+  TF_ASSERT_OK(
+      MergeBundles(env, {kAnotherPrefix, kBundlePrefixes[1]}, kMerged));
+  CheckDirFiles(kMerged, {"merged.index", "merged.data-00000-of-00002",
+                          "merged.data-00001-of-00002"});
+}
+
+TEST(TensorBundleTest, Error) {
+  {  // Dup keys.
+    BundleWriter writer(Env::Default(), Prefix("dup"));
+    writer.Add("foo", Constant_2x3(1.f));
+    writer.Add("foo", Constant_2x3(2.f));
+    EXPECT_TRUE(
+        StringPiece(writer.status().ToString()).contains("duplicate key"));
+    EXPECT_FALSE(writer.Finish().ok());
+  }
+  {  // Double finish
+    BundleWriter writer(Env::Default(), Prefix("bad"));
+    EXPECT_TRUE(writer.Finish().ok());
+    EXPECT_FALSE(writer.Finish().ok());
+  }
+  {  // Not found.
+    BundleReader reader(Env::Default(), Prefix("nonexist"));
+    EXPECT_TRUE(StringPiece(reader.status().ToString()).contains("Not found"));
+  }
+}
+
+TEST(TensorBundleTest, Checksum) {
+  // Randomly flips a byte in [pos_lhs, end of data file), or exactly byte
+  // pos_lhs if exact_pos == true.
+  auto FlipByte = [](const string& prefix, int pos_lhs,
+                     bool exact_pos = false) {
+    DCHECK_GE(pos_lhs, 0);
+    const string& datafile = DataFilename(Prefix(prefix), 0, 1);
+    string data;
+    TF_ASSERT_OK(ReadFileToString(Env::Default(), datafile, &data));
+
+    int byte_pos = 0;
+    if (!exact_pos) {
+      std::mt19937 rng;
+      std::uniform_int_distribution<int> dist(pos_lhs, data.size() - 1);
+      byte_pos = dist(rng);
+    } else {
+      byte_pos = pos_lhs;
+    }
+    data[byte_pos] = ~data[byte_pos];
+    TF_ASSERT_OK(WriteStringToFile(Env::Default(), datafile, data));
+  };
+  // The lookup should fail with a checksum-related message.
+  auto ExpectLookupFails = [](const string& prefix, const string& key,
+                              const string& expected_msg, Tensor& val) {
+    BundleReader reader(Env::Default(), Prefix(prefix));
+    Status status = reader.Lookup(key, &val);
+    EXPECT_TRUE(errors::IsDataLoss(status));
+    EXPECT_TRUE(StringPiece(status.ToString()).contains(expected_msg));
+  };
+
+  // Corrupts a float tensor.
+  {
+    BundleWriter writer(Env::Default(), Prefix("singleton"));
+    writer.Add("foo", Constant_2x3(1.f));
+    TF_ASSERT_OK(writer.Finish());
+
+    FlipByte("singleton", 0 /* corrupts any byte */);
+    Tensor val(DT_FLOAT, TensorShape({2, 3}));
+    ExpectLookupFails("singleton", "foo",
+                      "Checksum does not match" /* expected fail msg */, val);
+  }
+  // Corrupts a string tensor.
+  {
+    auto WriteStrings = []() {
+      BundleWriter writer(Env::Default(), Prefix("strings"));
+      writer.Add("foo", test::AsTensor<string>({"hello", "world"}));
+      TF_ASSERT_OK(writer.Finish());
+    };
+    // Corrupts the first two bytes, which are the varint32-encoded lengths
+    // of the two string elements.  Should hit mismatch on length cksum.
+    for (int i = 0; i < 2; ++i) {
+      WriteStrings();
+      FlipByte("strings", i, true /* corrupts exactly byte i */);
+      Tensor val(DT_STRING, TensorShape({2}));
+      ExpectLookupFails(
+          "strings", "foo",
+          "length checksum does not match" /* expected fail msg */, val);
+    }
+    // Corrupts the string bytes, should hit an overall cksum mismatch.
+    WriteStrings();
+    FlipByte("strings", 2 /* corrupts starting from byte 2 */);
+    Tensor val(DT_STRING, TensorShape({2}));
+    ExpectLookupFails("strings", "foo",
+                      "Checksum does not match" /* expected fail msg */, val);
+  }
+}
+
+TEST(TensorBundleTest, Endianness) {
+  BundleWriter writer(Env::Default(), Prefix("end"));
+  writer.Add("key", Constant_2x3<float>(1.0));
+  TF_ASSERT_OK(writer.Finish());
+
+  // Flips the endianness bit.
+  TF_ASSERT_OK(FlipEndiannessBit(Prefix("end")));
+
+  BundleReader reader(Env::Default(), Prefix("end"));
+  EXPECT_TRUE(errors::IsUnimplemented(reader.status()));
+  EXPECT_TRUE(StringPiece(reader.status().ToString())
+                  .contains("different endianness from the reader"));
+}
+
+TEST(TensorBundleTest, TruncatedTensorContents) {
+  Env* env = Env::Default();
+  BundleWriter writer(env, Prefix("end"));
+  writer.Add("key", Constant_2x3<float>(1.0));
+  TF_ASSERT_OK(writer.Finish());
+
+  // Truncates the data file by one byte, so that we hit EOF.
+  const string datafile = DataFilename(Prefix("end"), 0, 1);
+  string data;
+  TF_ASSERT_OK(ReadFileToString(env, datafile, &data));
+  ASSERT_TRUE(!data.empty());
+  TF_ASSERT_OK(WriteStringToFile(env, datafile,
+                                 StringPiece(data.data(), data.size() - 1)));
+
+  BundleReader reader(env, Prefix("end"));
+  TF_ASSERT_OK(reader.status());
+  Tensor val(DT_FLOAT, TensorShape({2, 3}));
+#if defined(PLATFORM_GOOGLE)
+  EXPECT_EQ("Data loss: Requested 24 bytes but read 23 bytes.",
+            reader.Lookup("key", &val).ToString());
+#else
+  EXPECT_TRUE(errors::IsOutOfRange(reader.Lookup("key", &val)));
+#endif
+}
+
+TEST(TensorBundleTest, HeaderEntry) {
+  {
+    BundleWriter writer(Env::Default(), Prefix("b"));
+    writer.Add("key", Constant_2x3<float>(1.0));
+    TF_ASSERT_OK(writer.Finish());
+  }
+
+  // Extracts out the header.
+  BundleHeaderProto header;
+  {
+    BundleReader reader(Env::Default(), Prefix("b"));
+    TF_ASSERT_OK(reader.status());
+    reader.Seek(kHeaderEntryKey);
+    ASSERT_TRUE(reader.Valid());
+    ASSERT_TRUE(ParseProtoUnlimited(&header, reader.value().data(),
+                                    reader.value().size()));
+  }
+
+  // num_shards
+  EXPECT_EQ(1, header.num_shards());
+  // endianness
+  if (port::kLittleEndian) {
+    EXPECT_EQ(BundleHeaderProto::LITTLE, header.endianness());
+  } else {
+    EXPECT_EQ(BundleHeaderProto::BIG, header.endianness());
+  }
+  // version
+  EXPECT_GT(kTensorBundleVersion, 0);
+  EXPECT_EQ(kTensorBundleVersion, header.version().producer());
+  EXPECT_EQ(kTensorBundleMinConsumer, header.version().min_consumer());
+}
+
+TEST(TensorBundleTest, VersionTest) {
+  // Min consumer.
+  {
+    VersionDef versions;
+    versions.set_producer(kTensorBundleVersion + 1);
+    versions.set_min_consumer(kTensorBundleVersion + 1);
+    VersionTest(
+        versions,
+        strings::StrCat("Checkpoint min consumer version ",
+                        kTensorBundleVersion + 1, " above current version ",
+                        kTensorBundleVersion, " for TensorFlow"));
+  }
+  // Min producer.
+  {
+    VersionDef versions;
+    versions.set_producer(kTensorBundleMinProducer - 1);
+    VersionTest(
+        versions,
+        strings::StrCat("Checkpoint producer version ",
+                        kTensorBundleMinProducer - 1, " below min producer ",
+                        kTensorBundleMinProducer, " supported by TensorFlow"));
+  }
+  // Bad consumer.
+  {
+    VersionDef versions;
+    versions.set_producer(kTensorBundleVersion + 1);
+    versions.add_bad_consumers(kTensorBundleVersion);
+    VersionTest(
+        versions,
+        strings::StrCat(
+            "Checkpoint disallows consumer version ", kTensorBundleVersion,
+            ".  Please upgrade TensorFlow: this version is likely buggy."));
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/tensor_slice_reader.cc b/tensorflow/core/util/tensor_slice_reader.cc
index 53752069969..7e56f6b3072 100644
--- a/tensorflow/core/util/tensor_slice_reader.cc
+++ b/tensorflow/core/util/tensor_slice_reader.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/io/iterator.h"
-#include "tensorflow/core/lib/io/match.h"
 #include "tensorflow/core/lib/io/table.h"
 #include "tensorflow/core/lib/io/table_options.h"
 #include "tensorflow/core/platform/env.h"
@@ -110,7 +109,7 @@ TensorSliceReader::TensorSliceReader(const string& filepattern,
                                      int preferred_shard)
     : filepattern_(filepattern), open_function_(std::move(open_function)) {
   VLOG(1) << "TensorSliceReader for " << filepattern;
-  Status s = io::GetMatchingFiles(Env::Default(), filepattern, &fnames_);
+  Status s = Env::Default()->GetMatchingPaths(filepattern, &fnames_);
   if (!s.ok()) {
     status_ = errors::InvalidArgument(
         "Unsuccessful TensorSliceReader constructor: "
diff --git a/tensorflow/core/util/test_log.proto b/tensorflow/core/util/test_log.proto
index 61f29fe131c..4eb397670d5 100644
--- a/tensorflow/core/util/test_log.proto
+++ b/tensorflow/core/util/test_log.proto
@@ -171,4 +171,7 @@ message TestResults {
 
   // Run-specific parameters (arguments, etc)
   RunConfiguration run_configuration = 8;
+
+  // Benchmark target identifier.
+  string name = 9;
 };
diff --git a/tensorflow/examples/tutorials/estimators/abalone.py b/tensorflow/examples/tutorials/estimators/abalone.py
index c8cd2f81830..978af5c9c32 100644
--- a/tensorflow/examples/tutorials/estimators/abalone.py
+++ b/tensorflow/examples/tutorials/estimators/abalone.py
@@ -41,15 +41,12 @@ flags.DEFINE_string(
 
 tf.logging.set_verbosity(tf.logging.INFO)
 
-FEATURES = ["len", "diam", "height", "whole_weight", "shucked_weight",
-            "viscera_weight", "shell_weight"]
-
 # Learning rate for the model
 LEARNING_RATE = 0.001
 
 
 def maybe_download():
-  """May be downloads training data and returns train and test file names."""
+  """Maybe downloads training data and returns train and test file names."""
   if FLAGS.train_data:
     train_file_name = FLAGS.train_data
   else:
diff --git a/tensorflow/examples/tutorials/mnist/BUILD b/tensorflow/examples/tutorials/mnist/BUILD
index aa5e3fd9bf9..60fd433a206 100644
--- a/tensorflow/examples/tutorials/mnist/BUILD
+++ b/tensorflow/examples/tutorials/mnist/BUILD
@@ -63,6 +63,18 @@ py_binary(
     ],
 )
 
+py_binary(
+    name = "mnist_softmax",
+    srcs = [
+        "mnist_softmax.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":input_data",
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
 py_test(
     name = "fully_connected_feed_test",
     size = "small",
diff --git a/tensorflow/examples/tutorials/mnist/mnist_softmax.py b/tensorflow/examples/tutorials/mnist/mnist_softmax.py
index 6621d7bb397..8b469fd9d14 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_softmax.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_softmax.py
@@ -22,37 +22,54 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import argparse
+
 # Import data
 from tensorflow.examples.tutorials.mnist import input_data
 
 import tensorflow as tf
 
-flags = tf.app.flags
-FLAGS = flags.FLAGS
-flags.DEFINE_string('data_dir', '/tmp/data/', 'Directory for storing data')
+FLAGS = None
 
-mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
 
-sess = tf.InteractiveSession()
+def main(_):
+  mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
 
-# Create the model
-x = tf.placeholder(tf.float32, [None, 784])
-W = tf.Variable(tf.zeros([784, 10]))
-b = tf.Variable(tf.zeros([10]))
-y = tf.nn.softmax(tf.matmul(x, W) + b)
+  # Create the model
+  x = tf.placeholder(tf.float32, [None, 784])
+  W = tf.Variable(tf.zeros([784, 10]))
+  b = tf.Variable(tf.zeros([10]))
+  y = tf.matmul(x, W) + b
 
-# Define loss and optimizer
-y_ = tf.placeholder(tf.float32, [None, 10])
-cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
-train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
+  # Define loss and optimizer
+  y_ = tf.placeholder(tf.float32, [None, 10])
 
-# Train
-tf.initialize_all_variables().run()
-for i in range(1000):
-  batch_xs, batch_ys = mnist.train.next_batch(100)
-  train_step.run({x: batch_xs, y_: batch_ys})
+  # The raw formulation of cross-entropy,
+  #
+  #   tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.softmax(y)),
+  #                                 reduction_indices=[1]))
+  #
+  # can be numerically unstable.
+  #
+  # So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
+  # outputs of 'y', and then average across the batch.
+  cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y, y_))
+  train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
 
-# Test trained model
-correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
-accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
-print(accuracy.eval({x: mnist.test.images, y_: mnist.test.labels}))
+  # Train
+  tf.initialize_all_variables().run()
+  for _ in range(1000):
+    batch_xs, batch_ys = mnist.train.next_batch(100)
+    train_step.run({x: batch_xs, y_: batch_ys})
+
+  # Test trained model
+  correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
+  accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
+  print(accuracy.eval({x: mnist.test.images, y_: mnist.test.labels}))
+
+if __name__ == '__main__':
+  parser = argparse.ArgumentParser()
+  parser.add_argument('--data_dir', type=str, default='/tmp/data',
+                      help='Directory for storing data')
+  FLAGS = parser.parse_args()
+  tf.app.run()
diff --git a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
index 70c9ad4f2f0..f1eff297180 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
@@ -24,19 +24,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import argparse
+
 import tensorflow as tf
 
 from tensorflow.examples.tutorials.mnist import input_data
 
-flags = tf.app.flags
-FLAGS = flags.FLAGS
-flags.DEFINE_boolean('fake_data', False, 'If true, uses fake data '
-                     'for unit testing.')
-flags.DEFINE_integer('max_steps', 1000, 'Number of steps to run trainer.')
-flags.DEFINE_float('learning_rate', 0.001, 'Initial learning rate.')
-flags.DEFINE_float('dropout', 0.9, 'Keep probability for training dropout.')
-flags.DEFINE_string('data_dir', '/tmp/data', 'Directory for storing data')
-flags.DEFINE_string('summaries_dir', '/tmp/mnist_logs', 'Summaries directory')
+FLAGS = None
 
 
 def train():
@@ -111,12 +105,23 @@ def train():
     tf.scalar_summary('dropout_keep_probability', keep_prob)
     dropped = tf.nn.dropout(hidden1, keep_prob)
 
-  y = nn_layer(dropped, 500, 10, 'layer2', act=tf.nn.softmax)
+  # Do not apply softmax activation yet, see below.
+  y = nn_layer(dropped, 500, 10, 'layer2', act=tf.identity)
 
   with tf.name_scope('cross_entropy'):
-    diff = y_ * tf.log(y)
+    # The raw formulation of cross-entropy,
+    #
+    # tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.softmax(y)),
+    #                               reduction_indices=[1]))
+    #
+    # can be numerically unstable.
+    #
+    # So here we use tf.nn.softmax_cross_entropy_with_logits on the
+    # raw outputs of the nn_layer above, and then average across
+    # the batch.
+    diff = tf.nn.softmax_cross_entropy_with_logits(y, y_)
     with tf.name_scope('total'):
-      cross_entropy = -tf.reduce_mean(diff)
+      cross_entropy = tf.reduce_mean(diff)
     tf.scalar_summary('cross entropy', cross_entropy)
 
   with tf.name_scope('train'):
@@ -182,4 +187,19 @@ def main(_):
 
 
 if __name__ == '__main__':
+  parser = argparse.ArgumentParser()
+  parser.add_argument('--fake_data', nargs='?', const=True, type=bool,
+                      default=False,
+                      help='If true, uses fake data for unit testing.')
+  parser.add_argument('--max_steps', type=int, default=1000,
+                      help='Number of steps to run trainer.')
+  parser.add_argument('--learning_rate', type=float, default=0.001,
+                      help='Initial learning rate')
+  parser.add_argument('--dropout', type=float, default=0.9,
+                      help='Keep probability for training dropout.')
+  parser.add_argument('--data_dir', type=str, default='/tmp/data',
+                      help='Directory for storing data')
+  parser.add_argument('--summaries_dir', type=str, default='/tmp/mnist_logs',
+                      help='Summaries directory')
+  FLAGS = parser.parse_args()
   tf.app.run()
diff --git a/tensorflow/g3doc/api_docs/python/array_ops.md b/tensorflow/g3doc/api_docs/python/array_ops.md
index c3139a1973b..658e62edc4a 100644
--- a/tensorflow/g3doc/api_docs/python/array_ops.md
+++ b/tensorflow/g3doc/api_docs/python/array_ops.md
@@ -178,6 +178,37 @@ tf.cast(a, tf.int32) ==> [1, 2]  # dtype=tf.int32
 *  <b>`TypeError`</b>: If `x` cannot be cast to the `dtype`.
 
 
+- - -
+
+### `tf.bitcast(input, type, name=None)` {#bitcast}
+
+Bitcasts a tensor from one type to another without copying data.
+
+Given a tensor `input`, this operation returns a tensor that has the same buffer
+data as `input` with datatype `type`.
+
+If the input datatype `T` is larger than the output datatype `type` then the
+shape changes from [...] to [..., sizeof(`T`)/sizeof(`type`)].
+
+If `T` is smaller than `type`, the operator requires that the rightmost
+dimension be equal to sizeof(`type`)/sizeof(`T`). The shape then goes from
+[..., sizeof(`type`)/sizeof(`T`)] to [...].
+
+*NOTE*: Bitcast is implemented as a low-level cast, so machines with different
+endian orderings will give different results.
+
+##### Args:
+
+
+*  <b>`input`</b>: A `Tensor`. Must be one of the following types: `float32`, `float64`, `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`, `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+*  <b>`type`</b>: A `tf.DType` from: `tf.float32, tf.float64, tf.int64, tf.int32, tf.uint8, tf.uint16, tf.int16, tf.int8, tf.complex64, tf.complex128, tf.qint8, tf.quint8, tf.qint32, tf.half`.
+*  <b>`name`</b>: A name for the operation (optional).
+
+##### Returns:
+
+  A `Tensor` of type `type`.
+
+
 - - -
 
 ### `tf.saturate_cast(value, dtype, name=None)` {#saturate_cast}
@@ -234,6 +265,26 @@ shape(t) ==> [2, 2, 3]
   A `Tensor` of type `out_type`.
 
 
+- - -
+
+### `tf.shape_n(input, out_type=None, name=None)` {#shape_n}
+
+Returns shape of tensors.
+
+This operation returns N 1-D integer tensors representing shape of `input[i]s`.
+
+##### Args:
+
+
+*  <b>`input`</b>: A list of at least 1 `Tensor` objects of the same type.
+*  <b>`out_type`</b>: An optional `tf.DType` from: `tf.int32, tf.int64`. Defaults to `tf.int32`.
+*  <b>`name`</b>: A name for the operation (optional).
+
+##### Returns:
+
+  A list with the same number of `Tensor` objects as `input` of `Tensor` objects of type out_type.
+
+
 - - -
 
 ### `tf.size(input, name=None, out_type=tf.int32)` {#size}
@@ -1177,12 +1228,151 @@ Extract `patches` from `images` and put them in the "depth" output dimension.
   `ksize_rows x ksize_cols x depth` vectorized in the "depth" dimension.
 
 
+- - -
+
+### `tf.space_to_batch_nd(input, block_shape, paddings, name=None)` {#space_to_batch_nd}
+
+SpaceToBatch for N-D tensors of type T.
+
+This operation divides "spatial" dimensions `[1, ..., M]` of the input into a
+grid of blocks of shape `block_shape`, and interleaves these blocks with the
+"batch" dimension (0) such that in the output, the spatial dimensions
+`[1, ..., M]` correspond to the position within the grid, and the batch
+dimension combines both the position within a spatial block and the original
+batch position.  Prior to division into blocks, the spatial dimensions of the
+input are optionally zero padded according to `paddings`.  See below for a
+precise description.
+
+##### Args:
+
+
+*  <b>`input`</b>: A `Tensor`.
+    N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
+    where spatial_shape has `M` dimensions.
+*  <b>`block_shape`</b>: A `Tensor`. Must be one of the following types: `int32`, `int64`.
+    1-D with shape `[M]`, all values must be >= 1.
+*  <b>`paddings`</b>: A `Tensor`. Must be one of the following types: `int32`, `int64`.
+    2-D with shape `[M, 2]`, all values must be >= 0.
+      `paddings[i] = [pad_start, pad_end]` specifies the padding for input dimension
+      `i + 1`, which corresponds to spatial dimension `i`.  It is required that
+      `block_shape[i]` divides `input_shape[i + 1] + pad_start + pad_end`.
+
+    This operation is equivalent to the following steps:
+
+    1. Zero-pad the start and end of dimensions `[1, ..., M]` of the
+       input according to `paddings` to produce `padded` of shape `padded_shape`.
+
+    2. Reshape `padded` to `reshaped_padded` of shape:
+         [batch] +
+         [padded_shape[1] / block_shape[0],
+           block_shape[0],
+          ...,
+          padded_shape[M] / block_shape[M-1],
+          block_shape[M-1]] +
+         remaining_shape
+
+    3. Permute dimensions of `reshaped_padded` to produce
+       `permuted_reshaped_padded` of shape:
+         block_shape +
+         [batch] +
+         [padded_shape[1] / block_shape[0],
+          ...,
+          padded_shape[M] / block_shape[M-1]] +
+         remaining_shape
+
+    4. Reshape `permuted_reshaped_padded` to flatten `block_shape` into the batch
+       dimension, producing an output tensor of shape:
+         [batch * prod(block_shape)] +
+         [padded_shape[1] / block_shape[0],
+          ...,
+          padded_shape[M] / block_shape[M-1]] +
+         remaining_shape
+
+    Some examples:
+
+    (1) For the following input of shape `[1, 2, 2, 1]`, `block_shape = [2, 2]`, and
+        `paddings = [[0, 0], [0, 0]]`:
+
+    ```prettyprint
+    x = [[[[1], [2]], [[3], [4]]]]
+    ```
+
+    The output tensor has shape `[4, 1, 1, 1]` and value:
+
+    ```prettyprint
+    [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+    ```
+
+    (2) For the following input of shape `[1, 2, 2, 3]`, `block_shape = [2, 2]`, and
+        `paddings = [[0, 0], [0, 0]]`:
+
+    ```prettyprint
+    x = [[[[1, 2, 3], [4, 5, 6]],
+          [[7, 8, 9], [10, 11, 12]]]]
+    ```
+
+    The output tensor has shape `[4, 1, 1, 3]` and value:
+
+    ```prettyprint
+    [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+    ```
+
+    (3) For the following input of shape `[1, 4, 4, 1]`, `block_shape = [2, 2]`, and
+        `paddings = [[0, 0], [0, 0]]`:
+
+    ```prettyprint
+    x = [[[[1],   [2],  [3],  [4]],
+          [[5],   [6],  [7],  [8]],
+          [[9],  [10], [11],  [12]],
+          [[13], [14], [15],  [16]]]]
+    ```
+
+    The output tensor has shape `[4, 2, 2, 1]` and value:
+
+    ```prettyprint
+    x = [[[[1], [3]], [[9], [11]]],
+         [[[2], [4]], [[10], [12]]],
+         [[[5], [7]], [[13], [15]]],
+         [[[6], [8]], [[14], [16]]]]
+    ```
+
+    (4) For the following input of shape `[2, 2, 4, 1]`, block_shape = `[2, 2]`, and
+        paddings = `[[0, 0], [2, 0]]`:
+
+    ```prettyprint
+    x = [[[[1],   [2],  [3],  [4]],
+          [[5],   [6],  [7],  [8]]],
+         [[[9],  [10], [11],  [12]],
+          [[13], [14], [15],  [16]]]]
+    ```
+
+    The output tensor has shape `[8, 1, 3, 1]` and value:
+
+    ```prettyprint
+    x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
+         [[[0], [2], [4]]], [[[0], [10], [12]]],
+         [[[0], [5], [7]]], [[[0], [13], [15]]],
+         [[[0], [6], [8]]], [[[0], [14], [16]]]]
+    ```
+
+    Among others, this operation is useful for reducing atrous convolution into
+    regular convolution.
+
+*  <b>`name`</b>: A name for the operation (optional).
+
+##### Returns:
+
+  A `Tensor`. Has the same type as `input`.
+
+
 - - -
 
 ### `tf.space_to_batch(input, paddings, block_size, name=None)` {#space_to_batch}
 
 SpaceToBatch for 4-D tensors of type T.
 
+This is a legacy version of the more general SpaceToBatchND.
+
 Zero-pads and then rearranges (permutes) blocks of spatial data into batch.
 More specifically, this op outputs a copy of the input tensor where values from
 the `height` and `width` dimensions are moved to the `batch` dimension. After
@@ -1288,12 +1478,191 @@ block size.
   A `Tensor`. Has the same type as `input`.
 
 
+- - -
+
+### `tf.required_space_to_batch_paddings(input_shape, block_shape, base_paddings=None, name=None)` {#required_space_to_batch_paddings}
+
+Calculate padding required to make block_shape divide input_shape.
+
+This function can be used to calculate a suitable paddings argument for use
+with space_to_batch_nd and batch_to_space_nd.
+
+##### Args:
+
+
+*  <b>`input_shape`</b>: int32 Tensor of shape [N].
+*  <b>`block_shape`</b>: int32 Tensor of shape [N].
+*  <b>`base_paddings`</b>: Optional int32 Tensor of shape [N, 2].  Specifies the minimum
+    amount of padding to use.  All elements must be >= 0.  If not specified,
+    defaults to 0.
+*  <b>`name`</b>: string.  Optional name prefix.
+
+##### Returns:
+
+  (paddings, crops), where:
+
+  `paddings` and `crops` are int32 Tensors of rank 2 and shape [N, 2]
+
+*  <b>`satisfying`</b>: 
+
+      paddings[i, 0] = base_paddings[i, 0].
+      0 <= paddings[i, 1] - base_paddings[i, 1] < block_shape[i]
+      (input_shape[i] + paddings[i, 0] + paddings[i, 1]) % block_shape[i] == 0
+
+      crops[i, 0] = 0
+      crops[i, 1] = paddings[i, 1] - base_paddings[i, 1]
+
+
+*  <b>`Raises`</b>: ValueError if called with incompatible shapes.
+
+
+- - -
+
+### `tf.batch_to_space_nd(input, block_shape, crops, name=None)` {#batch_to_space_nd}
+
+BatchToSpace for N-D tensors of type T.
+
+This operation reshapes the "batch" dimension 0 into `M + 1` dimensions of shape
+`block_shape + [batch]`, interleaves these blocks back into the grid defined by
+the spatial dimensions `[1, ..., M]`, to obtain a result with the same rank as
+the input.  The spatial dimensions of this intermediate result are then
+optionally cropped according to `crops` to produce the output.  This is the
+reverse of SpaceToBatch.  See below for a precise description.
+
+##### Args:
+
+
+*  <b>`input`</b>: A `Tensor`.
+    N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
+    where spatial_shape has M dimensions.
+*  <b>`block_shape`</b>: A `Tensor`. Must be one of the following types: `int32`, `int64`.
+    1-D with shape `[M]`, all values must be >= 1.
+*  <b>`crops`</b>: A `Tensor`. Must be one of the following types: `int32`, `int64`.
+    2-D with shape `[M, 2]`, all values must be >= 0.
+      `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input
+      dimension `i + 1`, which corresponds to spatial dimension `i`.  It is
+      required that
+      `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.
+
+    This operation is equivalent to the following steps:
+
+    1. Reshape `input` to `reshaped` of shape:
+         [block_shape[0], ..., block_shape[M-1],
+          batch / prod(block_shape),
+          input_shape[1], ..., input_shape[N-1]]
+
+    2. Permute dimensions of `reshaped` to produce `permuted` of shape
+         [batch / prod(block_shape),
+
+          input_shape[1], block_shape[0],
+          ...,
+          input_shape[M], block_shape[M-1],
+
+          input_shape[M+1], ..., input_shape[N-1]]
+
+    3. Reshape `permuted` to produce `reshaped_permuted` of shape
+         [batch / prod(block_shape),
+
+          input_shape[1] * block_shape[0],
+          ...,
+          input_shape[M] * block_shape[M-1],
+
+          input_shape[M+1],
+          ...,
+          input_shape[N-1]]
+
+    4. Crop the start and end of dimensions `[1, ..., M]` of
+       `reshaped_permuted` according to `crops` to produce the output of shape:
+         [batch / prod(block_shape),
+
+          input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],
+          ...,
+          input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],
+
+          input_shape[M+1], ..., input_shape[N-1]]
+
+    Some examples:
+
+    (1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and
+        `crops = [[0, 0], [0, 0]]`:
+
+    ```prettyprint
+    [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+    ```
+
+    The output tensor has shape `[1, 2, 2, 1]` and value:
+
+    ```prettyprint
+    x = [[[[1], [2]], [[3], [4]]]]
+    ```
+
+    (2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and
+        `crops = [[0, 0], [0, 0]]`:
+
+    ```prettyprint
+    [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+    ```
+
+    The output tensor has shape `[1, 2, 2, 3]` and value:
+
+    ```prettyprint
+    x = [[[[1, 2, 3], [4, 5, 6]],
+          [[7, 8, 9], [10, 11, 12]]]]
+    ```
+
+    (3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and
+        `crops = [[0, 0], [0, 0]]`:
+
+    ```prettyprint
+    x = [[[[1], [3]], [[9], [11]]],
+         [[[2], [4]], [[10], [12]]],
+         [[[5], [7]], [[13], [15]]],
+         [[[6], [8]], [[14], [16]]]]
+    ```
+
+    The output tensor has shape `[1, 4, 4, 1]` and value:
+
+    ```prettyprint
+    x = [[[1],   [2],  [3],  [4]],
+         [[5],   [6],  [7],  [8]],
+         [[9],  [10], [11],  [12]],
+         [[13], [14], [15],  [16]]]
+    ```
+
+    (4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and
+        `crops = [[0, 0], [2, 0]]`:
+
+    ```prettyprint
+    x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
+         [[[0], [2], [4]]], [[[0], [10], [12]]],
+         [[[0], [5], [7]]], [[[0], [13], [15]]],
+         [[[0], [6], [8]]], [[[0], [14], [16]]]]
+    ```
+
+    The output tensor has shape `[2, 2, 4, 1]` and value:
+
+    ```prettyprint
+    x = [[[[1],   [2],  [3],  [4]],
+          [[5],   [6],  [7],  [8]]],
+         [[[9],  [10], [11],  [12]],
+          [[13], [14], [15],  [16]]]]
+    ```
+
+*  <b>`name`</b>: A name for the operation (optional).
+
+##### Returns:
+
+  A `Tensor`. Has the same type as `input`.
+
+
 - - -
 
 ### `tf.batch_to_space(input, crops, block_size, name=None)` {#batch_to_space}
 
 BatchToSpace for 4-D tensors of type T.
 
+This is a legacy version of the more general BatchToSpaceND.
+
 Rearranges (permutes) data from batch into blocks of spatial data, followed by
 cropping. This is the reverse transformation of SpaceToBatch. More specifically,
 this op outputs a copy of the input tensor where values from the `batch`
@@ -1715,6 +2084,46 @@ Batched indexing into a 3-tensor:
   `indices`.
 
 
+- - -
+
+### `tf.unique_with_counts(x, out_idx=None, name=None)` {#unique_with_counts}
+
+Finds unique elements in a 1-D tensor.
+
+This operation returns a tensor `y` containing all of the unique elements of `x`
+sorted in the same order that they occur in `x`. This operation also returns a
+tensor `idx` the same size as `x` that contains the index of each value of `x`
+in the unique output `y`. Finally, it returns a third tensor `count` that
+contains the count of each element of `y` in `x`. In other words:
+
+`y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+
+For example:
+
+```prettyprint
+# tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+y, idx, count = unique_with_counts(x)
+y ==> [1, 2, 4, 7, 8]
+idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+count ==> [2, 1, 3, 1, 2]
+```
+
+##### Args:
+
+
+*  <b>`x`</b>: A `Tensor`. 1-D.
+*  <b>`out_idx`</b>: An optional `tf.DType` from: `tf.int32, tf.int64`. Defaults to `tf.int32`.
+*  <b>`name`</b>: A name for the operation (optional).
+
+##### Returns:
+
+  A tuple of `Tensor` objects (y, idx, count).
+
+*  <b>`y`</b>: A `Tensor`. Has the same type as `x`. 1-D.
+*  <b>`idx`</b>: A `Tensor` of type `out_idx`. 1-D.
+*  <b>`count`</b>: A `Tensor` of type `out_idx`. 1-D.
+
+
 - - -
 
 ### `tf.dynamic_partition(data, partitions, num_partitions, name=None)` {#dynamic_partition}
@@ -2002,96 +2411,36 @@ The output will be
 *  <b>`TypeError`</b>: If dtype of `on_value` and `off_value` don't match one another
 
 
-
-## Other Functions and Classes
 - - -
 
-### `tf.bitcast(input, type, name=None)` {#bitcast}
+### `tf.sequence_mask(lengths, maxlen=None, dtype=tf.bool, name=None)` {#sequence_mask}
 
-Bitcasts a tensor from one type to another without copying data.
+Return a mask tensor representing the first N positions of each row.
 
-Given a tensor `input`, this operation returns a tensor that has the same buffer
-data as `input` with datatype `type`.
-
-If the input datatype `T` is larger than the output datatype `type` then the
-shape changes from [...] to [..., sizeof(`T`)/sizeof(`type`)].
-
-If `T` is smaller than `type`, the operator requires that the rightmost
-dimension be equal to sizeof(`type`)/sizeof(`T`). The shape then goes from
-[..., sizeof(`type`)/sizeof(`T`)] to [...].
-
-*NOTE*: Bitcast is implemented as a low-level cast, so machines with different
-endian orderings will give different results.
-
-##### Args:
-
-
-*  <b>`input`</b>: A `Tensor`. Must be one of the following types: `float32`, `float64`, `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`, `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-*  <b>`type`</b>: A `tf.DType` from: `tf.float32, tf.float64, tf.int64, tf.int32, tf.uint8, tf.uint16, tf.int16, tf.int8, tf.complex64, tf.complex128, tf.qint8, tf.quint8, tf.qint32, tf.half`.
-*  <b>`name`</b>: A name for the operation (optional).
-
-##### Returns:
-
-  A `Tensor` of type `type`.
-
-
-- - -
-
-### `tf.shape_n(input, out_type=None, name=None)` {#shape_n}
-
-Returns shape of tensors.
-
-This operation returns N 1-D integer tensors representing shape of `input[i]s`.
-
-##### Args:
-
-
-*  <b>`input`</b>: A list of at least 1 `Tensor` objects of the same type.
-*  <b>`out_type`</b>: An optional `tf.DType` from: `tf.int32, tf.int64`. Defaults to `tf.int32`.
-*  <b>`name`</b>: A name for the operation (optional).
-
-##### Returns:
-
-  A list with the same number of `Tensor` objects as `input` of `Tensor` objects of type out_type.
-
-
-- - -
-
-### `tf.unique_with_counts(x, out_idx=None, name=None)` {#unique_with_counts}
-
-Finds unique elements in a 1-D tensor.
-
-This operation returns a tensor `y` containing all of the unique elements of `x`
-sorted in the same order that they occur in `x`. This operation also returns a
-tensor `idx` the same size as `x` that contains the index of each value of `x`
-in the unique output `y`. Finally, it returns a third tensor `count` that
-contains the count of each element of `y` in `x`. In other words:
-
-`y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
-
-For example:
-
-```prettyprint
-# tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
-y, idx, count = unique_with_counts(x)
-y ==> [1, 2, 4, 7, 8]
-idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
-count ==> [2, 1, 3, 1, 2]
+Example:
+```python
+tf.sequence_mask([1, 3, 2], 5) =
+  [[True, False, False, False, False],
+   [True, True, True, False, False],
+   [True, True, False, False, False]]
 ```
 
 ##### Args:
 
 
-*  <b>`x`</b>: A `Tensor`. 1-D.
-*  <b>`out_idx`</b>: An optional `tf.DType` from: `tf.int32, tf.int64`. Defaults to `tf.int32`.
-*  <b>`name`</b>: A name for the operation (optional).
+*  <b>`lengths`</b>: 1D integer tensor, all its values < maxlen.
+*  <b>`maxlen`</b>: scalar integer tensor, maximum length of each row. Default: use
+          maximum over lengths.
+*  <b>`dtype`</b>: output type of the resulting tensor.
+*  <b>`name`</b>: name of the op.
 
 ##### Returns:
 
-  A tuple of `Tensor` objects (y, idx, count).
+  A 2D mask tensor, as shown in the example above, cast to specified dtype.
 
-*  <b>`y`</b>: A `Tensor`. Has the same type as `x`. 1-D.
-*  <b>`idx`</b>: A `Tensor` of type `out_idx`. 1-D.
-*  <b>`count`</b>: A `Tensor` of type `out_idx`. 1-D.
+##### Raises:
+
+
+*  <b>`ValueError`</b>: if the arguments have invalid rank.
 
 
diff --git a/tensorflow/g3doc/api_docs/python/contrib.bayesflow.monte_carlo.md b/tensorflow/g3doc/api_docs/python/contrib.bayesflow.monte_carlo.md
index 2b1ab42011c..95f377ae315 100644
--- a/tensorflow/g3doc/api_docs/python/contrib.bayesflow.monte_carlo.md
+++ b/tensorflow/g3doc/api_docs/python/contrib.bayesflow.monte_carlo.md
@@ -46,7 +46,7 @@ Log E_q[ f(Z) p(Z) / q(Z) ]
 C := Max[ Log[f(Z)] + Log[p(Z)] - Log[q(Z)] ].
 ```
 
-The maximum value of the exponentiated term will be 0.0, and the the expecation
+The maximum value of the exponentiated term will be 0.0, and the expectation
 can be evaluated in a stable manner.
 
 ## Ops
@@ -69,9 +69,7 @@ User supplies either `Tensor` of samples `z`, or number of samples to draw `n`
 ##### Args:
 
 
-*  <b>`f`</b>: Callable mapping samples from `sampling_dist_q` to `Tensors` with
-    shape broadcastable to `q.batch_shape`.
-    For example, `f` works "just like" `sampling_dist_q.log_prob`.
+*  <b>`f`</b>: Callable mapping samples from `p` to `Tensors`.
 *  <b>`p`</b>: `tf.contrib.distributions.BaseDistribution`.
 *  <b>`z`</b>: `Tensor` of samples from `p`, produced by `p.sample_n`.
 *  <b>`n`</b>: Integer `Tensor`.  Number of samples to generate if `z` is not provided.
@@ -80,7 +78,36 @@ User supplies either `Tensor` of samples `z`, or number of samples to draw `n`
 
 ##### Returns:
 
-  A `Tensor` with same `dtype` as `p`, and shape equal to `p.batch_shape`.
+  A `Tensor` with the same `dtype` as `p`.
+
+
+*  <b>`Example`</b>: 
+
+```python
+N_samples = 10000
+
+distributions = tf.contrib.distributions
+
+dist = distributions.Uniform([0.0, 0.0], [1.0, 2.0])
+elementwise_mean = lambda x: x
+mean_sum = lambda x: tf.reduce_sum(x, 1)
+
+estimate_elementwise_mean_tf = monte_carlo.expectation(elementwise_mean,
+                                                       dist,
+                                                       n=N_samples)
+estimate_mean_sum_tf = monte_carlo.expectation(mean_sum,
+                                               dist,
+                                               n=N_samples)
+
+with tf.Session() as sess:
+  estimate_elementwise_mean, estimate_mean_sum = (
+      sess.run([estimate_elementwise_mean_tf, estimate_mean_sum_tf]))
+print estimate_elementwise_mean
+>>> np.array([ 0.50018013  1.00097895], dtype=np.float32)
+print estimate_mean_sum
+>>> 1.49571
+
+```
 
 
 - - -
diff --git a/tensorflow/g3doc/api_docs/python/contrib.crf.md b/tensorflow/g3doc/api_docs/python/contrib.crf.md
new file mode 100644
index 00000000000..8139c41cac9
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/contrib.crf.md
@@ -0,0 +1,214 @@
+<!-- This file is machine generated: DO NOT EDIT! -->
+
+# CRF (contrib)
+[TOC]
+
+Linear-chain CRF layer.
+
+## This package provides functions for building a linear-chain CRF layer.
+
+- - -
+
+### `tf.contrib.crf.crf_sequence_score(inputs, tag_indices, sequence_lengths, transition_params)` {#crf_sequence_score}
+
+Computes the unnormalized score for a tag sequence.
+
+##### Args:
+
+
+*  <b>`inputs`</b>: A [batch_size, max_seq_len, num_tags] tensor of unary potentials
+      to use as input to the CRF layer.
+*  <b>`tag_indices`</b>: A [batch_size, max_seq_len] matrix of tag indices for which we
+      compute the unnormalized score.
+*  <b>`sequence_lengths`</b>: A [batch_size] vector of true sequence lengths.
+*  <b>`transition_params`</b>: A [num_tags, num_tags] transition matrix.
+
+##### Returns:
+
+
+*  <b>`sequence_scores`</b>: A [batch_size] vector of unnormalized sequence scores.
+
+
+- - -
+
+### `tf.contrib.crf.crf_log_norm(inputs, sequence_lengths, transition_params)` {#crf_log_norm}
+
+Computes the normalization for a CRF.
+
+##### Args:
+
+
+*  <b>`inputs`</b>: A [batch_size, max_seq_len, num_tags] tensor of unary potentials
+      to use as input to the CRF layer.
+*  <b>`sequence_lengths`</b>: A [batch_size] vector of true sequence lengths.
+*  <b>`transition_params`</b>: A [num_tags, num_tags] transition matrix.
+
+##### Returns:
+
+
+*  <b>`log_norm`</b>: A [batch_size] vector of normalizers for a CRF.
+
+
+- - -
+
+### `tf.contrib.crf.crf_log_likelihood(inputs, tag_indices, sequence_lengths, transition_params=None)` {#crf_log_likelihood}
+
+Computes the log-likelihood of tag sequences in a CRF.
+
+##### Args:
+
+
+*  <b>`inputs`</b>: A [batch_size, max_seq_len, num_tags] tensor of unary potentials
+      to use as input to the CRF layer.
+*  <b>`tag_indices`</b>: A [batch_size, max_seq_len] matrix of tag indices for which we
+      compute the log-likelihood.
+*  <b>`sequence_lengths`</b>: A [batch_size] vector of true sequence lengths.
+*  <b>`transition_params`</b>: A [num_tags, num_tags] transition matrix, if available.
+
+##### Returns:
+
+
+*  <b>`log_likelihood`</b>: A scalar containing the log-likelihood of the given sequence
+      of tag indices.
+*  <b>`transition_params`</b>: A [num_tags, num_tags] transition matrix. This is either
+      provided by the caller or created in this function.
+
+
+- - -
+
+### `tf.contrib.crf.crf_unary_score(tag_indices, sequence_lengths, inputs)` {#crf_unary_score}
+
+Computes the unary scores of tag sequences.
+
+##### Args:
+
+
+*  <b>`tag_indices`</b>: A [batch_size, max_seq_len] matrix of tag indices.
+*  <b>`sequence_lengths`</b>: A [batch_size] vector of true sequence lengths.
+*  <b>`inputs`</b>: A [batch_size, max_seq_len, num_tags] tensor of unary potentials.
+
+##### Returns:
+
+
+*  <b>`unary_scores`</b>: A [batch_size] vector of unary scores.
+
+
+- - -
+
+### `tf.contrib.crf.crf_binary_score(tag_indices, sequence_lengths, transition_params)` {#crf_binary_score}
+
+Computes the binary scores of tag sequences.
+
+##### Args:
+
+
+*  <b>`tag_indices`</b>: A [batch_size, max_seq_len] matrix of tag indices.
+*  <b>`sequence_lengths`</b>: A [batch_size] vector of true sequence lengths.
+*  <b>`transition_params`</b>: A [num_tags, num_tags] matrix of binary potentials.
+
+##### Returns:
+
+
+*  <b>`binary_scores`</b>: A [batch_size] vector of binary scores.
+
+
+- - -
+
+### `class tf.contrib.crf.CrfForwardRnnCell` {#CrfForwardRnnCell}
+
+Computes the alpha values in a linear-chain CRF.
+
+See http://www.cs.columbia.edu/~mcollins/fb.pdf for reference.
+- - -
+
+#### `tf.contrib.crf.CrfForwardRnnCell.__call__(inputs, state, scope=None)` {#CrfForwardRnnCell.__call__}
+
+Build the CrfForwardRnnCell.
+
+##### Args:
+
+
+*  <b>`inputs`</b>: A [batch_size, num_tags] matrix of unary potentials.
+*  <b>`state`</b>: A [batch_size, num_tags] matrix containing the previous alpha
+      values.
+*  <b>`scope`</b>: Unused variable scope of this cell.
+
+##### Returns:
+
+  new_alphas, new_alphas: A pair of [batch_size, num_tags] matrices
+      values containing the new alpha values.
+
+
+- - -
+
+#### `tf.contrib.crf.CrfForwardRnnCell.__init__(transition_params)` {#CrfForwardRnnCell.__init__}
+
+Initialize the CrfForwardRnnCell.
+
+##### Args:
+
+
+*  <b>`transition_params`</b>: A [num_tags, num_tags] matrix of binary potentials.
+      This matrix is expanded into a [1, num_tags, num_tags] in preparation
+      for the broadcast summation occurring within the cell.
+
+
+- - -
+
+#### `tf.contrib.crf.CrfForwardRnnCell.output_size` {#CrfForwardRnnCell.output_size}
+
+
+
+
+- - -
+
+#### `tf.contrib.crf.CrfForwardRnnCell.state_size` {#CrfForwardRnnCell.state_size}
+
+
+
+
+- - -
+
+#### `tf.contrib.crf.CrfForwardRnnCell.zero_state(batch_size, dtype)` {#CrfForwardRnnCell.zero_state}
+
+Return zero-filled state tensor(s).
+
+##### Args:
+
+
+*  <b>`batch_size`</b>: int, float, or unit Tensor representing the batch size.
+*  <b>`dtype`</b>: the data type to use for the state.
+
+##### Returns:
+
+  If `state_size` is an int or TensorShape, then the return value is a
+  `N-D` tensor of shape `[batch_size x state_size]` filled with zeros.
+
+  If `state_size` is a nested list or tuple, then the return value is
+  a nested list or tuple (of the same structure) of `2-D` tensors with
+the shapes `[batch_size x s]` for each s in `state_size`.
+
+
+
+- - -
+
+### `tf.contrib.crf.viterbi_decode(score, transition_params)` {#viterbi_decode}
+
+Decode the highest scoring sequence of tags outside of TensorFlow.
+
+This should only be used at test time.
+
+##### Args:
+
+
+*  <b>`score`</b>: A [seq_len, num_tags] matrix of unary potentials.
+*  <b>`transition_params`</b>: A [num_tags, num_tags] matrix of binary potentials.
+
+##### Returns:
+
+
+*  <b>`viterbi`</b>: A [seq_len] list of integers containing the highest scoring tag
+      indices.
+*  <b>`viterbi_score`</b>: A float containing the score for the viterbi sequence.
+
+
diff --git a/tensorflow/g3doc/api_docs/python/contrib.distributions.md b/tensorflow/g3doc/api_docs/python/contrib.distributions.md
index d3c98cb23d2..282770be7be 100644
--- a/tensorflow/g3doc/api_docs/python/contrib.distributions.md
+++ b/tensorflow/g3doc/api_docs/python/contrib.distributions.md
@@ -14285,45 +14285,17 @@ mixture probabilities) and a list of `Distribution` objects
 all having matching dtype, batch shape, event shape, and continuity
 properties (the components).
 
-The user does not pass the list of distributions directly, but rather a
-list of `(constructor, batch_tensor_params_dict)` pairs,
-called `components`. The list of distributions is created via:
-
-```python
-distributions = [
-  c(**params_dict) for (c, params_dict) in zip(*components)
-]
-```
-
-This form allows for certain types of batch-shape optimizations within
-this class.
-
-An example of `components`:
-
-```python
-components = [
-  (tf.contrib.distributions.Normal, {"mu": 3.0, "sigma": 1.0}),
-  (functools.partial(tf.contrib.distributions.Normal, validate_args=False),
-   {"mu": 3.0, "sigma": 2.0}),
-  (tf.contrib.distributions.Normal.from_params,
-   {"mu": 1.0, "sigma": -1.0})
-]
-```
-
 The `num_classes` of `cat` must be possible to infer at graph construction
-time and match `len(distributions)`.
+time and match `len(components)`.
 
 ##### Args:
 
 
 *  <b>`cat`</b>: A `Categorical` distribution instance, representing the probabilities
       of `distributions`.
-*  <b>`components`</b>: A list or tuple of `(constructor, batch_tensor_params)`
-    tuples.  The `constructor` must be a callable, and `batch_tensor_params`
-    must be a dict mapping constructor kwargs to batchwise parameters.
-    Each `Distribution` instance created by calling
-    `constructor(**batch_tensor_params)` must have the same type, be defined
-    on the same domain, and have matching `event_shape` and `batch_shape`.
+*  <b>`components`</b>: A list or tuple of `Distribution` instances.
+    Each instance must have the same type, be defined on the same domain,
+    and have matching `event_shape` and `batch_shape`.
 *  <b>`validate_args`</b>: `Boolean`, default `False`.  If `True`, raise a runtime
     error if batch or event ranks are inconsistent between cat and any of
     the distributions.  This is only checked if the ranks cannot be
@@ -14339,16 +14311,13 @@ time and match `len(distributions)`.
 
 *  <b>`TypeError`</b>: If cat is not a `Categorical`, or `components` is not
     a list or tuple, or the elements of `components` are not
-    tuples of the form `(callable, dict)`, or the objects resulting
-    from calling `callable(**dict)` are not instances of `Distribution`, or
-    the resulting instances of `Distribution` do not have matching
-    continuity properties, or do not have matching `dtype`.
-*  <b>`ValueError`</b>: If `components` is an empty list or tuple, or the
-    distributions created from `components` do have a statically known event
-    rank.  If `cat.num_classes` cannot be inferred at graph creation time,
+    instances of `Distribution`, or do not have matching `dtype`.
+*  <b>`ValueError`</b>: If `components` is an empty list or tuple, or its
+    elements do not have a statically known event rank.
+    If `cat.num_classes` cannot be inferred at graph creation time,
     or the constant value of `cat.num_classes` is not equal to
-    `len(distributions)`, or all `distributions` and `cat` do not have
-    matching static batch shapes, or all components' distributions do not
+    `len(components)`, or all `components` and `cat` do not have
+    matching static batch shapes, or all components do not
     have matching static event shapes.
 
 
@@ -14427,7 +14396,7 @@ cdf(x) := P[X <= x]
 
 - - -
 
-#### `tf.contrib.distributions.Mixture.distributions` {#Mixture.distributions}
+#### `tf.contrib.distributions.Mixture.components` {#Mixture.components}
 
 
 
@@ -14453,7 +14422,7 @@ Shanon entropy in nats.
 A lower bound on the entropy of this mixture model.
 
 The bound below is not always very tight, and its usefulness depends
-on the mixture probabilities and the distributions in use.
+on the mixture probabilities and the components in use.
 
 A lower bound is useful for ELBO when the `Mixture` is the variational
 distribution:
@@ -15044,8 +15013,8 @@ Get the KL-divergence KL(dist_a || dist_b).
 ##### Args:
 
 
-*  <b>`dist_a`</b>: instance of distributions.Distribution.
-*  <b>`dist_b`</b>: instance of distributions.Distribution.
+*  <b>`dist_a`</b>: The first distribution.
+*  <b>`dist_b`</b>: The second distribution.
 *  <b>`allow_nan`</b>: If `False` (default), a runtime error is raised
     if the KL returns NaN values for any batch entry of the given
     distributions.  If `True`, the KL may return a NaN for the given entry.
@@ -15058,7 +15027,6 @@ Get the KL-divergence KL(dist_a || dist_b).
 ##### Raises:
 
 
-*  <b>`TypeError`</b>: If dist_a or dist_b is not an instance of Distribution.
 *  <b>`NotImplementedError`</b>: If no KL method is defined for distribution types
     of dist_a and dist_b.
 
@@ -15109,12 +15077,6 @@ Initialize the KL registrar.
 *  <b>`dist_cls_a`</b>: the class of the first argument of the KL divergence.
 *  <b>`dist_cls_b`</b>: the class of the second argument of the KL divergence.
 
-##### Raises:
-
-
-*  <b>`TypeError`</b>: if dist_cls_a or dist_cls_b are not subclasses of
-    Distribution.
-
 
 
 
diff --git a/tensorflow/g3doc/api_docs/python/contrib.ffmpeg.md b/tensorflow/g3doc/api_docs/python/contrib.ffmpeg.md
index 674ba5e2b0e..572b7ccc1ac 100644
--- a/tensorflow/g3doc/api_docs/python/contrib.ffmpeg.md
+++ b/tensorflow/g3doc/api_docs/python/contrib.ffmpeg.md
@@ -27,13 +27,16 @@ uncompressed_binary = ffmpeg.encode_audio(
 
 Create an op that decodes the contents of an audio file.
 
+Note that ffmpeg is free to select the "best" audio track from an mp4.
+https://trac.ffmpeg.org/wiki/Map
+
 ##### Args:
 
 
 *  <b>`contents`</b>: The binary contents of the audio file to decode. This is a
       scalar.
 *  <b>`file_format`</b>: A string specifying which format the contents will conform
-      to. This can be mp3, ogg, or wav.
+      to. This can be mp3, mp4, ogg, or wav.
 *  <b>`samples_per_second`</b>: The number of samples per second that is assumed.
       In some cases, resampling will occur to generate the correct sample
       rate.
diff --git a/tensorflow/g3doc/api_docs/python/contrib.layers.md b/tensorflow/g3doc/api_docs/python/contrib.layers.md
index 52b6c764d24..038780f3533 100644
--- a/tensorflow/g3doc/api_docs/python/contrib.layers.md
+++ b/tensorflow/g3doc/api_docs/python/contrib.layers.md
@@ -57,7 +57,7 @@ they need to be added as a dependency to the train_op, example:
 
   update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
   if update_ops:
-    updates = tf.group(update_ops)
+    updates = tf.group(*update_ops)
     total_loss = control_flow_ops.with_dependencies([updates], total_loss)
 
 One can set update_collections=None to force the updates in place, but that
@@ -74,7 +74,8 @@ can have speed penalty, specially in distributed settings.
     not used. When the next layer is linear (also e.g. `nn.relu`), this can be
     disabled since the scaling can be done by the next layer.
 *  <b>`epsilon`</b>: small float added to variance to avoid dividing by zero.
-*  <b>`activation_fn`</b>: Optional activation function.
+*  <b>`activation_fn`</b>: activation function, default set to None to skip it and
+    maintain a linear activation.
 *  <b>`updates_collections`</b>: collections to collect the update ops for computation.
     The updates_ops need to be excuted with the train_op.
     If None, a control dependency would be added to make sure the updates are
@@ -133,10 +134,12 @@ greater than one.
 *  <b>`rate`</b>: integer. If less than or equal to 1, a standard convolution is used.
     If greater than 1, than the a'trous convolution is applied and `stride`
     must be set to 1.
-*  <b>`activation_fn`</b>: activation function.
+*  <b>`activation_fn`</b>: activation function, set to None to skip it and maintain
+    a linear activation.
 *  <b>`normalizer_fn`</b>: normalization function to use instead of `biases`. If
-    `normalize_fn` is provided then `biases_initializer` and
+    `normalizer_fn` is provided then `biases_initializer` and
     `biases_regularizer` are ignored and `biases` are not created nor added.
+    default set to None for no normalizer function
 *  <b>`normalizer_params`</b>: normalization function parameters.
 *  <b>`weights_initializer`</b>: An initializer for the weights.
 *  <b>`weights_regularizer`</b>: Optional regularizer for the weights.
@@ -188,10 +191,12 @@ operations such as image gradients:
     Can be an int if both strides are the same. Note that presently
     both strides must have the same value.
 *  <b>`padding`</b>: the padding type to use, either 'SAME' or 'VALID'.
-*  <b>`activation_fn`</b>: activation function.
+*  <b>`activation_fn`</b>: activation function, set to None to skip it and maintain
+    a linear activation.
 *  <b>`normalizer_fn`</b>: normalization function to use instead of `biases`. If
-    `normalize_fn` is provided then `biases_initializer` and
+    `normalizer_fn` is provided then `biases_initializer` and
     `biases_regularizer` are ignored and `biases` are not created nor added.
+    default set to None for no normalizer function
 *  <b>`normalizer_params`</b>: normalization function parameters.
 *  <b>`weights_initializer`</b>: An initializer for the weights.
 *  <b>`weights_regularizer`</b>: Optional regularizer for the weights.
@@ -232,10 +237,12 @@ second variable called 'biases' is added to the result of the operation.
     Can be an int if both strides are the same.  Note that presently
     both strides must have the same value.
 *  <b>`padding`</b>: one of 'VALID' or 'SAME'.
-*  <b>`activation_fn`</b>: activation function.
+*  <b>`activation_fn`</b>: activation function, set to None to skip it and maintain
+    a linear activation.
 *  <b>`normalizer_fn`</b>: normalization function to use instead of `biases`. If
-    `normalize_fn` is provided then `biases_initializer` and
+    `normalizer_fn` is provided then `biases_initializer` and
     `biases_regularizer` are ignored and `biases` are not created nor added.
+    default set to None for no normalizer function
 *  <b>`normalizer_params`</b>: normalization function parameters.
 *  <b>`weights_initializer`</b>: An initializer for the weights.
 *  <b>`weights_regularizer`</b>: Optional regularizer for the weights.
@@ -307,10 +314,12 @@ prior to the initial matrix multiply by `weights`.
 *  <b>`inputs`</b>: A tensor of with at least rank 2 and value for the last dimension,
     i.e. `[batch_size, depth]`, `[None, None, None, channels]`.
 *  <b>`num_outputs`</b>: Integer or long, the number of output units in the layer.
-*  <b>`activation_fn`</b>: activation function.
+*  <b>`activation_fn`</b>: activation function, set to None to skip it and maintain
+    a linear activation.
 *  <b>`normalizer_fn`</b>: normalization function to use instead of `biases`. If
-    `normalize_fn` is provided then `biases_initializer` and
+    `normalizer_fn` is provided then `biases_initializer` and
     `biases_regularizer` are ignored and `biases` are not created nor added.
+    default set to None for no normalizer function
 *  <b>`normalizer_params`</b>: normalization function parameters.
 *  <b>`weights_initializer`</b>: An initializer for the weights.
 *  <b>`weights_regularizer`</b>: Optional regularizer for the weights.
@@ -356,7 +365,8 @@ Can be used as a normalizer function for conv2d and fully_connected.
 *  <b>`scale`</b>: If True, multiply by `gamma`. If False, `gamma` is
     not used. When the next layer is linear (also e.g. `nn.relu`), this can be
     disabled since the scaling can be done by the next layer.
-*  <b>`activation_fn`</b>: Optional activation function.
+*  <b>`activation_fn`</b>: activation function, default set to None to skip it and
+    maintain a linear activation.
 *  <b>`reuse`</b>: whether or not the layer and its variables should be reused. To be
     able to reuse the layer scope must be given.
 *  <b>`variables_collections`</b>: optional collections for the variables.
@@ -543,10 +553,12 @@ to produce the end result.
 *  <b>`stride`</b>: a list of length 2: [stride_height, stride_width], specifying the
     depthwise convolution stride. Can be an int if both strides are the same.
 *  <b>`padding`</b>: one of 'VALID' or 'SAME'.
-*  <b>`activation_fn`</b>: activation function.
+*  <b>`activation_fn`</b>: activation function, set to None to skip it and maintain
+    a linear activation.
 *  <b>`normalizer_fn`</b>: normalization function to use instead of `biases`. If
-    `normalize_fn` is provided then `biases_initializer` and
+    `normalizer_fn` is provided then `biases_initializer` and
     `biases_regularizer` are ignored and `biases` are not created nor added.
+    default set to None for no normalizer function
 *  <b>`normalizer_params`</b>: normalization function parameters.
 *  <b>`weights_initializer`</b>: An initializer for the weights.
 *  <b>`weights_regularizer`</b>: Optional regularizer for the weights.
diff --git a/tensorflow/g3doc/api_docs/python/contrib.learn.md b/tensorflow/g3doc/api_docs/python/contrib.learn.md
index 2b31ba08812..92307fc4497 100644
--- a/tensorflow/g3doc/api_docs/python/contrib.learn.md
+++ b/tensorflow/g3doc/api_docs/python/contrib.learn.md
@@ -675,19 +675,21 @@ Initializes a DNNClassifier instance.
 
   A `DNNClassifier` estimator.
 
-
-- - -
-
-#### `tf.contrib.learn.DNNClassifier.__repr__()` {#DNNClassifier.__repr__}
+##### Raises:
 
 
+*  <b>`ValueError`</b>: If `n_classes` < 2.
 
 
 - - -
 
 #### `tf.contrib.learn.DNNClassifier.bias_` {#DNNClassifier.bias_}
 
+DEPRECATED FUNCTION
 
+THIS FUNCTION IS DEPRECATED. It will be removed after 2016-10-13.
+Instructions for updating:
+This method inspects the private state of the object, and should not be used
 
 
 - - -
@@ -697,100 +699,25 @@ Initializes a DNNClassifier instance.
 
 
 
-- - -
-
-#### `tf.contrib.learn.DNNClassifier.dnn_bias_` {#DNNClassifier.dnn_bias_}
-
-Returns bias of deep neural network part.
-
-
-- - -
-
-#### `tf.contrib.learn.DNNClassifier.dnn_weights_` {#DNNClassifier.dnn_weights_}
-
-Returns weights of deep neural network part.
-
-
 - - -
 
 #### `tf.contrib.learn.DNNClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#DNNClassifier.evaluate}
 
-See `Evaluable`.
-
-##### Raises:
-
-
-*  <b>`ValueError`</b>: If at least one of `x` or `y` is provided, and at least one of
-      `input_fn` or `feed_fn` is provided.
-      Or if `metrics` is not `None` or `dict`.
+See evaluable.Evaluable.
 
 
 - - -
 
-#### `tf.contrib.learn.DNNClassifier.export(*args, **kwargs)` {#DNNClassifier.export}
+#### `tf.contrib.learn.DNNClassifier.export(export_dir, input_fn=None, input_feature_key=None, use_deprecated_input_fn=True, signature_fn=None, default_batch_size=1, exports_to_keep=None)` {#DNNClassifier.export}
 
-Exports inference graph into given dir. (deprecated arguments)
-
-SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-09-23.
-Instructions for updating:
-The signature of the input_fn accepted by export is changing to be consistent with what's used by tf.Learn Estimator's train/evaluate. input_fn and input_feature_key will become required args, and use_deprecated_input_fn will default to False and be removed altogether.
-
-    Args:
-      export_dir: A string containing a directory to write the exported graph
-        and checkpoints.
-      input_fn: If `use_deprecated_input_fn` is true, then a function that given
-        `Tensor` of `Example` strings, parses it into features that are then
-        passed to the model. Otherwise, a function that takes no argument and
-        returns a tuple of (features, targets), where features is a dict of
-        string key to `Tensor` and targets is a `Tensor` that's currently not
-        used (and so can be `None`).
-      input_feature_key: Only used if `use_deprecated_input_fn` is false. String
-        key into the features dict returned by `input_fn` that corresponds to
-        the raw `Example` strings `Tensor` that the exported model will take as
-        input.
-      use_deprecated_input_fn: Determines the signature format of `input_fn`.
-      signature_fn: Function that returns a default signature and a named
-        signature map, given `Tensor` of `Example` strings, `dict` of `Tensor`s
-        for features and `Tensor` or `dict` of `Tensor`s for predictions.
-      prediction_key: The key for a tensor in the `predictions` dict (output
-        from the `model_fn`) to use as the `predictions` input to the
-        `signature_fn`. Optional. If `None`, predictions will pass to
-        `signature_fn` without filtering.
-      default_batch_size: Default batch size of the `Example` placeholder.
-      exports_to_keep: Number of exports to keep.
+See BaseEstimator.export.
 
 
 - - -
 
 #### `tf.contrib.learn.DNNClassifier.fit(x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None)` {#DNNClassifier.fit}
 
-See `Trainable`.
-
-##### Raises:
-
-
-*  <b>`ValueError`</b>: If `x` or `y` are not `None` while `input_fn` is not `None`.
-*  <b>`ValueError`</b>: If both `steps` and `max_steps` are not `None`.
-
-
-- - -
-
-#### `tf.contrib.learn.DNNClassifier.get_params(deep=True)` {#DNNClassifier.get_params}
-
-Get parameters for this estimator.
-
-##### Args:
-
-
-*  <b>`deep`</b>: boolean, optional
-
-    If `True`, will return the parameters for this estimator and
-    contained subobjects that are estimators.
-
-##### Returns:
-
-  params : mapping of string to any
-  Parameter names mapped to their values.
+See trainable.Trainable.
 
 
 - - -
@@ -817,21 +744,7 @@ Returns value of the variable given by name.
 
 ##### Returns:
 
-  Numpy array - value of the tensor.
-
-
-- - -
-
-#### `tf.contrib.learn.DNNClassifier.linear_bias_` {#DNNClassifier.linear_bias_}
-
-Returns bias of the linear part.
-
-
-- - -
-
-#### `tf.contrib.learn.DNNClassifier.linear_weights_` {#DNNClassifier.linear_weights_}
-
-Returns weights per feature of the linear part.
+  `Tensor` object.
 
 
 - - -
@@ -841,49 +754,6 @@ Returns weights per feature of the linear part.
 
 
 
-- - -
-
-#### `tf.contrib.learn.DNNClassifier.partial_fit(x=None, y=None, input_fn=None, steps=1, batch_size=None, monitors=None)` {#DNNClassifier.partial_fit}
-
-Incremental fit on a batch of samples.
-
-This method is expected to be called several times consecutively
-on different or the same chunks of the dataset. This either can
-implement iterative training or out-of-core/online training.
-
-This is especially useful when the whole dataset is too big to
-fit in memory at the same time. Or when model is taking long time
-to converge, and you want to split up training into subparts.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`steps`</b>: Number of steps for which to train model. If `None`, train forever.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`. Must be `None` if `input_fn` is provided.
-*  <b>`monitors`</b>: List of `BaseMonitor` subclass instances. Used for callbacks
-    inside the training loop.
-
-##### Returns:
-
-  `self`, for chaining.
-
-##### Raises:
-
-
-*  <b>`ValueError`</b>: If at least one of `x` and `y` is provided, and `input_fn` is
-      provided.
-
-
 - - -
 
 #### `tf.contrib.learn.DNNClassifier.predict(*args, **kwargs)` {#DNNClassifier.predict}
@@ -936,37 +806,15 @@ altogether. The behavior of this flag is described below.
       probabilities if as_iterable is True).
 
 
-- - -
-
-#### `tf.contrib.learn.DNNClassifier.set_params(**params)` {#DNNClassifier.set_params}
-
-Set the parameters of this estimator.
-
-The method works on simple estimators as well as on nested objects
-(such as pipelines). The former have parameters of the form
-``<component>__<parameter>`` so that it's possible to update each
-component of a nested object.
-
-##### Args:
-
-
-*  <b>`**params`</b>: Parameters.
-
-##### Returns:
-
-  self
-
-##### Raises:
-
-
-*  <b>`ValueError`</b>: If params contain invalid names.
-
-
 - - -
 
 #### `tf.contrib.learn.DNNClassifier.weights_` {#DNNClassifier.weights_}
 
+DEPRECATED FUNCTION
 
+THIS FUNCTION IS DEPRECATED. It will be removed after 2016-10-13.
+Instructions for updating:
+This method inspects the private state of the object, and should not be used
 
 
 
diff --git a/tensorflow/g3doc/api_docs/python/contrib.metrics.md b/tensorflow/g3doc/api_docs/python/contrib.metrics.md
index 6d684118ae8..bfe9afcfa8f 100644
--- a/tensorflow/g3doc/api_docs/python/contrib.metrics.md
+++ b/tensorflow/g3doc/api_docs/python/contrib.metrics.md
@@ -120,7 +120,7 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 *  <b>`predictions`</b>: The predicted values, a `Tensor` of any shape.
 *  <b>`labels`</b>: The ground truth values, a `Tensor` whose shape matches
     `predictions`.
-*  <b>`weights`</b>: An optional `Tensor` whose shape matches `predictions`.
+*  <b>`weights`</b>: An optional `Tensor` whose shape is broadcastable to `predictions`.
 *  <b>`metrics_collections`</b>: An optional list of collections that `accuracy` should
     be added to.
 *  <b>`updates_collections`</b>: An optional list of collections that `update_op` should
@@ -166,7 +166,7 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 
 
 *  <b>`values`</b>: A `Tensor` of arbitrary dimensions.
-*  <b>`weights`</b>: An optional `Tensor` whose shape matches `values`.
+*  <b>`weights`</b>: An optional `Tensor` whose shape is broadcastable to `values`.
 *  <b>`metrics_collections`</b>: An optional list of collections that `mean`
     should be added to.
 *  <b>`updates_collections`</b>: An optional list of collections that `update_op`
@@ -217,7 +217,7 @@ Instructions for updating:
     labels: The ground truth values, a `bool` `Tensor` whose dimensions must
       match `predictions`.
     ignore_mask: An optional, `bool` `Tensor` whose shape matches `predictions`.
-    weights: An optional `Tensor` whose shape matches `predictions`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
     metrics_collections: An optional list of collections that `recall` should
       be added to.
     updates_collections: An optional list of collections that `update_op` should
@@ -269,7 +269,7 @@ Instructions for updating:
     labels: The ground truth values, a `bool` `Tensor` whose dimensions must
       match `predictions`.
     ignore_mask: An optional, `bool` `Tensor` whose shape matches `predictions`.
-    weights: An optional `Tensor` whose shape matches `predictions`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
     metrics_collections: An optional list of collections that `precision` should
       be added to.
     updates_collections: An optional list of collections that `update_op` should
@@ -322,7 +322,7 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 *  <b>`predictions`</b>: A floating point `Tensor` of arbitrary shape and whose values
     are in the range `[0, 1]`.
 *  <b>`labels`</b>: A `bool` `Tensor` whose shape matches `predictions`.
-*  <b>`weights`</b>: An optional `Tensor` whose shape matches `predictions`.
+*  <b>`weights`</b>: An optional `Tensor` whose shape is broadcastable to `predictions`.
 *  <b>`num_thresholds`</b>: The number of thresholds to use when discretizing the roc
     curve.
 *  <b>`metrics_collections`</b>: An optional list of collections that `auc` should be
@@ -384,7 +384,7 @@ Instructions for updating:
       `int64`.
     k: The number of top elements to look at for computing recall.
     ignore_mask: An optional, `bool` `Tensor` whose shape matches `predictions`.
-    weights: An optional `Tensor` whose shape matches `predictions`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
     metrics_collections: An optional list of collections that `recall_at_k`
       should be added to.
     updates_collections: An optional list of collections `update_op` should be
@@ -432,7 +432,7 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 
 *  <b>`predictions`</b>: A `Tensor` of arbitrary shape.
 *  <b>`labels`</b>: A `Tensor` of the same shape as `predictions`.
-*  <b>`weights`</b>: An optional `Tensor` whose shape matches `predictions`.
+*  <b>`weights`</b>: An optional `Tensor` whose shape is broadcastable to `predictions`.
 *  <b>`metrics_collections`</b>: An optional list of collections that
     `mean_absolute_error` should be added to.
 *  <b>`updates_collections`</b>: An optional list of collections that `update_op` should
@@ -491,7 +491,7 @@ Instructions for updating:
       have. This value must be provided, since a confusion matrix of
       dimension = [num_classes, num_classes] will be allocated.
     ignore_mask: An optional, `bool` `Tensor` whose shape matches `predictions`.
-    weights: An optional `Tensor` whose shape matches `predictions`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
     metrics_collections: An optional list of collections that `mean_iou`
       should be added to.
     updates_collections: An optional list of collections `update_op` should be
@@ -538,7 +538,7 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 *  <b>`predictions`</b>: A `Tensor` of arbitrary shape.
 *  <b>`labels`</b>: A `Tensor` of the same shape as `predictions`.
 *  <b>`normalizer`</b>: A `Tensor` of the same shape as `predictions`.
-*  <b>`weights`</b>: An optional `Tensor` whose shape matches `predictions`.
+*  <b>`weights`</b>: An optional `Tensor` whose shape is broadcastable to `predictions`.
 *  <b>`metrics_collections`</b>: An optional list of collections that
     `mean_relative_error` should be added to.
 *  <b>`updates_collections`</b>: An optional list of collections that `update_op` should
@@ -589,7 +589,7 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 
 *  <b>`predictions`</b>: A `Tensor` of arbitrary shape.
 *  <b>`labels`</b>: A `Tensor` of the same shape as `predictions`.
-*  <b>`weights`</b>: An optional `Tensor` whose shape matches `predictions`.
+*  <b>`weights`</b>: An optional `Tensor` whose shape is broadcastable to `predictions`.
 *  <b>`metrics_collections`</b>: An optional list of collections that
     `mean_squared_error` should be added to.
 *  <b>`updates_collections`</b>: An optional list of collections that `update_op` should
@@ -640,7 +640,7 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 
 *  <b>`predictions`</b>: A `Tensor` of arbitrary shape.
 *  <b>`labels`</b>: A `Tensor` of the same shape as `predictions`.
-*  <b>`weights`</b>: An optional `Tensor` whose shape matches `predictions`.
+*  <b>`weights`</b>: An optional `Tensor` whose shape is broadcastable to `predictions`.
 *  <b>`metrics_collections`</b>: An optional list of collections that
     `root_mean_squared_error` should be added to.
 *  <b>`updates_collections`</b>: An optional list of collections that `update_op` should
@@ -767,9 +767,9 @@ https://wikipedia.org/wiki/Weighted_arithmetic_mean#Weighted_sample_variance
 ##### Raises:
 
 
-*  <b>`ValueError`</b>: If labels and predictions are of different sizes or if the
-    ignore_mask is of the wrong size or if either `metrics_collections` or
-    `updates_collections` are not a list or tuple.
+*  <b>`ValueError`</b>: If `labels` and `predictions` are of different sizes, or if
+    `weights` is the wrong size, or if either `metrics_collections` or
+    `updates_collections` are not a `list` or `tuple`.
 
 
 - - -
@@ -796,8 +796,8 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 *  <b>`predictions`</b>: A `Tensor` of the same shape as `labels`.
 *  <b>`labels`</b>: A `Tensor` of arbitrary shape.
 *  <b>`dim`</b>: The dimension along which the cosine distance is computed.
-*  <b>`weights`</b>: An optional `Tensor` whose shape matches `predictions`, and whose
-    dimension `dim` is 1.
+*  <b>`weights`</b>: An optional `Tensor` whose shape is broadcastable to `predictions`,
+    and whose dimension `dim` is 1.
 *  <b>`metrics_collections`</b>: An optional list of collections that the metric
     value variable should be added to.
 *  <b>`updates_collections`</b>: An optional list of collections that the metric update
@@ -849,7 +849,7 @@ Instructions for updating:
     values: A numeric `Tensor` of arbitrary size.
     threshold: A scalar threshold.
     ignore_mask: An optional, `bool` `Tensor` whose shape matches `values`.
-    weights: An optional `Tensor` whose shape matches `values`.
+    weights: An optional `Tensor` whose shape is broadcastable to `values`.
     metrics_collections: An optional list of collections that the metric
       value variable should be added to.
     updates_collections: An optional list of collections that the metric update
@@ -899,7 +899,7 @@ following: https://en.wikipedia.org/wiki/Sensitivity_and_specificity
     are in the range `[0, 1]`.
 *  <b>`labels`</b>: A `bool` `Tensor` whose shape matches `predictions`.
 *  <b>`specificity`</b>: A scalar value in range `[0, 1]`.
-*  <b>`weights`</b>: An optional `Tensor` whose shape matches `predictions`.
+*  <b>`weights`</b>: An optional `Tensor` whose shape is broadcastable to `predictions`.
 *  <b>`num_thresholds`</b>: The number of thresholds to use for matching the given
     specificity.
 *  <b>`metrics_collections`</b>: An optional list of collections that `sensitivity`
@@ -926,6 +926,64 @@ following: https://en.wikipedia.org/wiki/Sensitivity_and_specificity
     or `updates_collections` are not a list or tuple.
 
 
+- - -
+
+### `tf.contrib.metrics.streaming_sparse_average_precision_at_k(predictions, labels, k, weights=None, metrics_collections=None, updates_collections=None, name=None)` {#streaming_sparse_average_precision_at_k}
+
+Computes average precision@k of predictions with respect to sparse labels.
+
+See `sparse_average_precision_at_k` for details on formula. `weights` are
+applied to the result of `sparse_average_precision_at_k`.
+
+`streaming_sparse_average_precision_at_k` creates two local variables,
+`average_precision_at_<k>/count` and `average_precision_at_<k>/total`, that
+are used to compute the frequency. This frequency is ultimately returned as
+`precision_at_<k>`: an idempotent operation that simply divides
+`true_positive_at_<k>` by total (`true_positive_at_<k>` +
+`false_positive_at_<k>`).
+
+For estimation of the metric over a stream of data, the function creates an
+`update_op` operation that updates these variables and returns the
+`precision_at_<k>`. Internally, a `top_k` operation computes a `Tensor`
+indicating the top `k` `predictions`. Set operations applied to `top_k` and
+`labels` calculate the true positives and false positives weighted by
+`weights`. Then `update_op` increments `true_positive_at_<k>` and
+`false_positive_at_<k>` using these values.
+
+If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
+
+##### Args:
+
+
+*  <b>`predictions`</b>: Float `Tensor` with shape [D1, ... DN, num_classes] where
+    N >= 1. Commonly, N=1 and `predictions` has shape
+    [batch size, num_classes]. The final dimension contains the logit values
+    for each class. [D1, ... DN] must match `labels`.
+*  <b>`labels`</b>: `int64` `Tensor` or `SparseTensor` with shape
+    [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of
+    target classes for the associated prediction. Commonly, N=1 and `labels`
+    has shape [batch_size, num_labels]. [D1, ... DN] must match
+    `predictions_idx`. Values should be in range [0, num_classes], where
+    num_classes is the last dimension of `predictions`.
+*  <b>`k`</b>: Integer, k for @k metric. This will calculate an average precision for
+    range `[1,k]`, as documented above.
+*  <b>`weights`</b>: An optional `Tensor` whose shape is broadcastable to the first
+    [D1, ... DN] dimensions of `predictions` and `labels`.
+*  <b>`metrics_collections`</b>: An optional list of collections that values should
+    be added to.
+*  <b>`updates_collections`</b>: An optional list of collections that updates should
+    be added to.
+*  <b>`name`</b>: Name of new update operation, and namespace for other dependent ops.
+
+##### Returns:
+
+
+*  <b>`mean_average_precision`</b>: Scalar `float64` `Tensor` with the mean average
+    precision values.
+*  <b>`update`</b>: `Operation` that increments variables appropriately, and whose
+    value matches `metric`.
+
+
 - - -
 
 ### `tf.contrib.metrics.streaming_sparse_precision_at_k(*args, **kwargs)` {#streaming_sparse_precision_at_k}
@@ -1106,7 +1164,7 @@ following: https://en.wikipedia.org/wiki/Sensitivity_and_specificity
     are in the range `[0, 1]`.
 *  <b>`labels`</b>: A `bool` `Tensor` whose shape matches `predictions`.
 *  <b>`sensitivity`</b>: A scalar value in range `[0, 1]`.
-*  <b>`weights`</b>: An optional `Tensor` whose shape matches `predictions`.
+*  <b>`weights`</b>: An optional `Tensor` whose shape is broadcastable to `predictions`.
 *  <b>`num_thresholds`</b>: The number of thresholds to use for matching the given
     sensitivity.
 *  <b>`metrics_collections`</b>: An optional list of collections that `specificity`
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.RegisterKL.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.RegisterKL.md
index 569b9f3185a..07fe04d1224 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.RegisterKL.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.RegisterKL.md
@@ -40,10 +40,4 @@ Initialize the KL registrar.
 *  <b>`dist_cls_a`</b>: the class of the first argument of the KL divergence.
 *  <b>`dist_cls_b`</b>: the class of the second argument of the KL divergence.
 
-##### Raises:
-
-
-*  <b>`TypeError`</b>: if dist_cls_a or dist_cls_b are not subclasses of
-    Distribution.
-
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.layers.convolution2d_in_plane.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.layers.convolution2d_in_plane.md
index 47907c9b966..b84bf95639d 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.layers.convolution2d_in_plane.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.layers.convolution2d_in_plane.md
@@ -23,10 +23,12 @@ operations such as image gradients:
     Can be an int if both strides are the same. Note that presently
     both strides must have the same value.
 *  <b>`padding`</b>: the padding type to use, either 'SAME' or 'VALID'.
-*  <b>`activation_fn`</b>: activation function.
+*  <b>`activation_fn`</b>: activation function, set to None to skip it and maintain
+    a linear activation.
 *  <b>`normalizer_fn`</b>: normalization function to use instead of `biases`. If
-    `normalize_fn` is provided then `biases_initializer` and
+    `normalizer_fn` is provided then `biases_initializer` and
     `biases_regularizer` are ignored and `biases` are not created nor added.
+    default set to None for no normalizer function
 *  <b>`normalizer_params`</b>: normalization function parameters.
 *  <b>`weights_initializer`</b>: An initializer for the weights.
 *  <b>`weights_regularizer`</b>: Optional regularizer for the weights.
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.nn.rnn.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.nn.rnn.md
index d9e935f8fb3..4b50b296dc8 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.nn.rnn.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.nn.rnn.md
@@ -3,7 +3,7 @@
 Creates a recurrent neural network specified by RNNCell `cell`.
 
 The simplest form of RNN network generated is:
-```py
+```python
   state = cell.zero_state(...)
   outputs = []
   for input_ in inputs:
@@ -20,11 +20,13 @@ sequence length of the minibatch (thus saving computational time),
 and properly propagates the state at an example's sequence length
 to the final state output.
 
-The dynamic calculation performed is, at time t for batch row b,
+The dynamic calculation performed is, at time `t` for batch row `b`,
+```python
   (output, state)(b, t) =
     (t >= sequence_length(b))
       ? (zeros(cell.output_size), states(b, sequence_length(b) - 1))
       : cell(input(b, t), state(b, t - 1))
+```
 
 ##### Args:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.py_func.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.py_func.md
index 5430b0ad8e0..dfb59ea089f 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.py_func.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.py_func.md
@@ -21,8 +21,8 @@ sinh(x) as an op in the graph.
 
 *  <b>`func`</b>: A python function.
 *  <b>`inp`</b>: A list of `Tensor`.
-*  <b>`Tout`</b>: A list of tensorflow data types or a single tensorflow data type
-        indicating what `func` returns.
+*  <b>`Tout`</b>: A list or tuple of tensorflow data types or a single tensorflow data
+        type if there is only one, indicating what `func` returns.
 *  <b>`stateful`</b>: A boolean indicating whether the function should be considered
             stateful or stateless. I.e. whether it, given the same input, will
             return the same output and at the same time does not change state
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.accumulate_n.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.accumulate_n.md
index a85d0d7f87a..567df54e50d 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.accumulate_n.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.accumulate_n.md
@@ -5,6 +5,9 @@ Returns the element-wise sum of a list of tensors.
 Optionally, pass `shape` and `tensor_dtype` for shape and type checking,
 otherwise, these are inferred.
 
+NOTE: This operation is not differentiable and cannot be used if inputs depend
+on trainable variables. Please use tf.add_n for such cases.
+
 For example:
 
 ```python
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.assign.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.assign.md
new file mode 100644
index 00000000000..f72385be605
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.assign.md
@@ -0,0 +1,28 @@
+### `tf.assign(ref, value, validate_shape=None, use_locking=None, name=None)` {#assign}
+
+Update 'ref' by assigning 'value' to it.
+
+This operation outputs "ref" after the assignment is done.
+This makes it easier to chain operations that need to use the reset value.
+
+##### Args:
+
+
+*  <b>`ref`</b>: A mutable `Tensor`.
+    Should be from a `Variable` node. May be uninitialized.
+*  <b>`value`</b>: A `Tensor`. Must have the same type as `ref`.
+    The value to be assigned to the variable.
+*  <b>`validate_shape`</b>: An optional `bool`. Defaults to `True`.
+    If true, the operation will validate that the shape
+    of 'value' matches the shape of the Tensor being assigned to.  If false,
+    'ref' will take on the shape of 'value'.
+*  <b>`use_locking`</b>: An optional `bool`. Defaults to `True`.
+    If True, the assignment will be protected by a lock;
+    otherwise the behavior is undefined, but may exhibit less contention.
+*  <b>`name`</b>: A name for the operation (optional).
+
+##### Returns:
+
+  Same as "ref".  Returned as a convenience for operations that want
+  to use the new value after the variable has been reset.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.batch_to_space.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.batch_to_space.md
index 4a3bd703e58..a5ecd15aecc 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.batch_to_space.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.batch_to_space.md
@@ -2,6 +2,8 @@
 
 BatchToSpace for 4-D tensors of type T.
 
+This is a legacy version of the more general BatchToSpaceND.
+
 Rearranges (permutes) data from batch into blocks of spatial data, followed by
 cropping. This is the reverse transformation of SpaceToBatch. More specifically,
 this op outputs a copy of the input tensor where values from the `batch`
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.crf.crf_log_likelihood.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.crf.crf_log_likelihood.md
new file mode 100644
index 00000000000..6fdfb470c5d
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.crf.crf_log_likelihood.md
@@ -0,0 +1,22 @@
+### `tf.contrib.crf.crf_log_likelihood(inputs, tag_indices, sequence_lengths, transition_params=None)` {#crf_log_likelihood}
+
+Computes the log-likelihood of tag sequences in a CRF.
+
+##### Args:
+
+
+*  <b>`inputs`</b>: A [batch_size, max_seq_len, num_tags] tensor of unary potentials
+      to use as input to the CRF layer.
+*  <b>`tag_indices`</b>: A [batch_size, max_seq_len] matrix of tag indices for which we
+      compute the log-likelihood.
+*  <b>`sequence_lengths`</b>: A [batch_size] vector of true sequence lengths.
+*  <b>`transition_params`</b>: A [num_tags, num_tags] transition matrix, if available.
+
+##### Returns:
+
+
+*  <b>`log_likelihood`</b>: A scalar containing the log-likelihood of the given sequence
+      of tag indices.
+*  <b>`transition_params`</b>: A [num_tags, num_tags] transition matrix. This is either
+      provided by the caller or created in this function.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.metrics.streaming_mean_squared_error.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.metrics.streaming_mean_squared_error.md
index aba4c3f55ab..66ebfa248a8 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.metrics.streaming_mean_squared_error.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.metrics.streaming_mean_squared_error.md
@@ -23,7 +23,7 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 
 *  <b>`predictions`</b>: A `Tensor` of arbitrary shape.
 *  <b>`labels`</b>: A `Tensor` of the same shape as `predictions`.
-*  <b>`weights`</b>: An optional `Tensor` whose shape matches `predictions`.
+*  <b>`weights`</b>: An optional `Tensor` whose shape is broadcastable to `predictions`.
 *  <b>`metrics_collections`</b>: An optional list of collections that
     `mean_squared_error` should be added to.
 *  <b>`updates_collections`</b>: An optional list of collections that `update_op` should
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.einsum.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.einsum.md
new file mode 100644
index 00000000000..75db85a75bc
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.einsum.md
@@ -0,0 +1,6 @@
+### `tf.einsum(axes, *inputs)` {#einsum}
+
+A generalized contraction between tensors of arbitrary dimension.
+
+Like numpy.einsum.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.bayesflow.monte_carlo.expectation.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.bayesflow.monte_carlo.expectation.md
index f34649987b1..961447937d9 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.bayesflow.monte_carlo.expectation.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.bayesflow.monte_carlo.expectation.md
@@ -14,9 +14,7 @@ User supplies either `Tensor` of samples `z`, or number of samples to draw `n`
 ##### Args:
 
 
-*  <b>`f`</b>: Callable mapping samples from `sampling_dist_q` to `Tensors` with
-    shape broadcastable to `q.batch_shape`.
-    For example, `f` works "just like" `sampling_dist_q.log_prob`.
+*  <b>`f`</b>: Callable mapping samples from `p` to `Tensors`.
 *  <b>`p`</b>: `tf.contrib.distributions.BaseDistribution`.
 *  <b>`z`</b>: `Tensor` of samples from `p`, produced by `p.sample_n`.
 *  <b>`n`</b>: Integer `Tensor`.  Number of samples to generate if `z` is not provided.
@@ -25,5 +23,34 @@ User supplies either `Tensor` of samples `z`, or number of samples to draw `n`
 
 ##### Returns:
 
-  A `Tensor` with same `dtype` as `p`, and shape equal to `p.batch_shape`.
+  A `Tensor` with the same `dtype` as `p`.
+
+
+*  <b>`Example`</b>: 
+
+```python
+N_samples = 10000
+
+distributions = tf.contrib.distributions
+
+dist = distributions.Uniform([0.0, 0.0], [1.0, 2.0])
+elementwise_mean = lambda x: x
+mean_sum = lambda x: tf.reduce_sum(x, 1)
+
+estimate_elementwise_mean_tf = monte_carlo.expectation(elementwise_mean,
+                                                       dist,
+                                                       n=N_samples)
+estimate_mean_sum_tf = monte_carlo.expectation(mean_sum,
+                                               dist,
+                                               n=N_samples)
+
+with tf.Session() as sess:
+  estimate_elementwise_mean, estimate_mean_sum = (
+      sess.run([estimate_elementwise_mean_tf, estimate_mean_sum_tf]))
+print estimate_elementwise_mean
+>>> np.array([ 0.50018013  1.00097895], dtype=np.float32)
+print estimate_mean_sum
+>>> 1.49571
+
+```
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.crf.CrfForwardRnnCell.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.crf.CrfForwardRnnCell.md
new file mode 100644
index 00000000000..a319e9bead4
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.crf.CrfForwardRnnCell.md
@@ -0,0 +1,73 @@
+Computes the alpha values in a linear-chain CRF.
+
+See http://www.cs.columbia.edu/~mcollins/fb.pdf for reference.
+- - -
+
+#### `tf.contrib.crf.CrfForwardRnnCell.__call__(inputs, state, scope=None)` {#CrfForwardRnnCell.__call__}
+
+Build the CrfForwardRnnCell.
+
+##### Args:
+
+
+*  <b>`inputs`</b>: A [batch_size, num_tags] matrix of unary potentials.
+*  <b>`state`</b>: A [batch_size, num_tags] matrix containing the previous alpha
+      values.
+*  <b>`scope`</b>: Unused variable scope of this cell.
+
+##### Returns:
+
+  new_alphas, new_alphas: A pair of [batch_size, num_tags] matrices
+      containing the new alpha values.
+
+
+- - -
+
+#### `tf.contrib.crf.CrfForwardRnnCell.__init__(transition_params)` {#CrfForwardRnnCell.__init__}
+
+Initialize the CrfForwardRnnCell.
+
+##### Args:
+
+
+*  <b>`transition_params`</b>: A [num_tags, num_tags] matrix of binary potentials.
+      This matrix is expanded into a [1, num_tags, num_tags] in preparation
+      for the broadcast summation occurring within the cell.
+
+
+- - -
+
+#### `tf.contrib.crf.CrfForwardRnnCell.output_size` {#CrfForwardRnnCell.output_size}
+
+
+
+
+- - -
+
+#### `tf.contrib.crf.CrfForwardRnnCell.state_size` {#CrfForwardRnnCell.state_size}
+
+
+
+
+- - -
+
+#### `tf.contrib.crf.CrfForwardRnnCell.zero_state(batch_size, dtype)` {#CrfForwardRnnCell.zero_state}
+
+Return zero-filled state tensor(s).
+
+##### Args:
+
+
+*  <b>`batch_size`</b>: int, float, or unit Tensor representing the batch size.
+*  <b>`dtype`</b>: the data type to use for the state.
+
+##### Returns:
+
+  If `state_size` is an int or TensorShape, then the return value is a
+  `N-D` tensor of shape `[batch_size x state_size]` filled with zeros.
+
+  If `state_size` is a nested list or tuple, then the return value is
+  a nested list or tuple (of the same structure) of `2-D` tensors with
+the shapes `[batch_size x s]` for each s in `state_size`.
+
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.crf.crf_binary_score.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.crf.crf_binary_score.md
new file mode 100644
index 00000000000..956f52766de
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.crf.crf_binary_score.md
@@ -0,0 +1,16 @@
+### `tf.contrib.crf.crf_binary_score(tag_indices, sequence_lengths, transition_params)` {#crf_binary_score}
+
+Computes the binary scores of tag sequences.
+
+##### Args:
+
+
+*  <b>`tag_indices`</b>: A [batch_size, max_seq_len] matrix of tag indices.
+*  <b>`sequence_lengths`</b>: A [batch_size] vector of true sequence lengths.
+*  <b>`transition_params`</b>: A [num_tags, num_tags] matrix of binary potentials.
+
+##### Returns:
+
+
+*  <b>`binary_scores`</b>: A [batch_size] vector of binary scores.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.metrics.streaming_auc.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.metrics.streaming_auc.md
index 33576e24033..86e7943000a 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.metrics.streaming_auc.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.metrics.streaming_auc.md
@@ -27,7 +27,7 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 *  <b>`predictions`</b>: A floating point `Tensor` of arbitrary shape and whose values
     are in the range `[0, 1]`.
 *  <b>`labels`</b>: A `bool` `Tensor` whose shape matches `predictions`.
-*  <b>`weights`</b>: An optional `Tensor` whose shape matches `predictions`.
+*  <b>`weights`</b>: An optional `Tensor` whose shape is broadcastable to `predictions`.
 *  <b>`num_thresholds`</b>: The number of thresholds to use when discretizing the roc
     curve.
 *  <b>`metrics_collections`</b>: An optional list of collections that `auc` should be
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.metrics.streaming_mean.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.metrics.streaming_mean.md
index 9841f044e6d..9a510c3e6a8 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.metrics.streaming_mean.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.metrics.streaming_mean.md
@@ -18,7 +18,7 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 
 
 *  <b>`values`</b>: A `Tensor` of arbitrary dimensions.
-*  <b>`weights`</b>: An optional `Tensor` whose shape matches `values`.
+*  <b>`weights`</b>: An optional `Tensor` whose shape is broadcastable to `values`.
 *  <b>`metrics_collections`</b>: An optional list of collections that `mean`
     should be added to.
 *  <b>`updates_collections`</b>: An optional list of collections that `update_op`
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.metrics.streaming_mean_relative_error.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.metrics.streaming_mean_relative_error.md
index f2116c04344..794aff441e8 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.metrics.streaming_mean_relative_error.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.metrics.streaming_mean_relative_error.md
@@ -24,7 +24,7 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 *  <b>`predictions`</b>: A `Tensor` of arbitrary shape.
 *  <b>`labels`</b>: A `Tensor` of the same shape as `predictions`.
 *  <b>`normalizer`</b>: A `Tensor` of the same shape as `predictions`.
-*  <b>`weights`</b>: An optional `Tensor` whose shape matches `predictions`.
+*  <b>`weights`</b>: An optional `Tensor` whose shape is broadcastable to `predictions`.
 *  <b>`metrics_collections`</b>: An optional list of collections that
     `mean_relative_error` should be added to.
 *  <b>`updates_collections`</b>: An optional list of collections that `update_op` should
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.required_space_to_batch_paddings.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.required_space_to_batch_paddings.md
new file mode 100644
index 00000000000..ac3bd931fb0
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.required_space_to_batch_paddings.md
@@ -0,0 +1,35 @@
+### `tf.required_space_to_batch_paddings(input_shape, block_shape, base_paddings=None, name=None)` {#required_space_to_batch_paddings}
+
+Calculate padding required to make block_shape divide input_shape.
+
+This function can be used to calculate a suitable paddings argument for use
+with space_to_batch_nd and batch_to_space_nd.
+
+##### Args:
+
+
+*  <b>`input_shape`</b>: int32 Tensor of shape [N].
+*  <b>`block_shape`</b>: int32 Tensor of shape [N].
+*  <b>`base_paddings`</b>: Optional int32 Tensor of shape [N, 2].  Specifies the minimum
+    amount of padding to use.  All elements must be >= 0.  If not specified,
+    defaults to 0.
+*  <b>`name`</b>: string.  Optional name prefix.
+
+##### Returns:
+
+  (paddings, crops), where:
+
+  `paddings` and `crops` are int32 Tensors of rank 2 and shape [N, 2]
+
+*  <b>`satisfying`</b>: 
+
+      paddings[i, 0] = base_paddings[i, 0].
+      0 <= paddings[i, 1] - base_paddings[i, 1] < block_shape[i]
+      (input_shape[i] + paddings[i, 0] + paddings[i, 1]) % block_shape[i] == 0
+
+      crops[i, 0] = 0
+      crops[i, 1] = paddings[i, 1] - base_paddings[i, 1]
+
+
+*  <b>`Raises`</b>: ValueError if called with incompatible shapes.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.space_to_batch_nd.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.space_to_batch_nd.md
new file mode 100644
index 00000000000..92175726fee
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.space_to_batch_nd.md
@@ -0,0 +1,134 @@
+### `tf.space_to_batch_nd(input, block_shape, paddings, name=None)` {#space_to_batch_nd}
+
+SpaceToBatch for N-D tensors of type T.
+
+This operation divides "spatial" dimensions `[1, ..., M]` of the input into a
+grid of blocks of shape `block_shape`, and interleaves these blocks with the
+"batch" dimension (0) such that in the output, the spatial dimensions
+`[1, ..., M]` correspond to the position within the grid, and the batch
+dimension combines both the position within a spatial block and the original
+batch position.  Prior to division into blocks, the spatial dimensions of the
+input are optionally zero padded according to `paddings`.  See below for a
+precise description.
+
+##### Args:
+
+
+*  <b>`input`</b>: A `Tensor`.
+    N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
+    where spatial_shape has `M` dimensions.
+*  <b>`block_shape`</b>: A `Tensor`. Must be one of the following types: `int32`, `int64`.
+    1-D with shape `[M]`, all values must be >= 1.
+*  <b>`paddings`</b>: A `Tensor`. Must be one of the following types: `int32`, `int64`.
+    2-D with shape `[M, 2]`, all values must be >= 0.
+      `paddings[i] = [pad_start, pad_end]` specifies the padding for input dimension
+      `i + 1`, which corresponds to spatial dimension `i`.  It is required that
+      `block_shape[i]` divides `input_shape[i + 1] + pad_start + pad_end`.
+
+    This operation is equivalent to the following steps:
+
+    1. Zero-pad the start and end of dimensions `[1, ..., M]` of the
+       input according to `paddings` to produce `padded` of shape `padded_shape`.
+
+    2. Reshape `padded` to `reshaped_padded` of shape:
+         [batch] +
+         [padded_shape[1] / block_shape[0],
+           block_shape[0],
+          ...,
+          padded_shape[M] / block_shape[M-1],
+          block_shape[M-1]] +
+         remaining_shape
+
+    3. Permute dimensions of `reshaped_padded` to produce
+       `permuted_reshaped_padded` of shape:
+         block_shape +
+         [batch] +
+         [padded_shape[1] / block_shape[0],
+          ...,
+          padded_shape[M] / block_shape[M-1]] +
+         remaining_shape
+
+    4. Reshape `permuted_reshaped_padded` to flatten `block_shape` into the batch
+       dimension, producing an output tensor of shape:
+         [batch * prod(block_shape)] +
+         [padded_shape[1] / block_shape[0],
+          ...,
+          padded_shape[M] / block_shape[M-1]] +
+         remaining_shape
+
+    Some examples:
+
+    (1) For the following input of shape `[1, 2, 2, 1]`, `block_shape = [2, 2]`, and
+        `paddings = [[0, 0], [0, 0]]`:
+
+    ```prettyprint
+    x = [[[[1], [2]], [[3], [4]]]]
+    ```
+
+    The output tensor has shape `[4, 1, 1, 1]` and value:
+
+    ```prettyprint
+    [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+    ```
+
+    (2) For the following input of shape `[1, 2, 2, 3]`, `block_shape = [2, 2]`, and
+        `paddings = [[0, 0], [0, 0]]`:
+
+    ```prettyprint
+    x = [[[[1, 2, 3], [4, 5, 6]],
+          [[7, 8, 9], [10, 11, 12]]]]
+    ```
+
+    The output tensor has shape `[4, 1, 1, 3]` and value:
+
+    ```prettyprint
+    [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+    ```
+
+    (3) For the following input of shape `[1, 4, 4, 1]`, `block_shape = [2, 2]`, and
+        `paddings = [[0, 0], [0, 0]]`:
+
+    ```prettyprint
+    x = [[[[1],   [2],  [3],  [4]],
+          [[5],   [6],  [7],  [8]],
+          [[9],  [10], [11],  [12]],
+          [[13], [14], [15],  [16]]]]
+    ```
+
+    The output tensor has shape `[4, 2, 2, 1]` and value:
+
+    ```prettyprint
+    x = [[[[1], [3]], [[9], [11]]],
+         [[[2], [4]], [[10], [12]]],
+         [[[5], [7]], [[13], [15]]],
+         [[[6], [8]], [[14], [16]]]]
+    ```
+
+    (4) For the following input of shape `[2, 2, 4, 1]`, block_shape = `[2, 2]`, and
+        paddings = `[[0, 0], [2, 0]]`:
+
+    ```prettyprint
+    x = [[[[1],   [2],  [3],  [4]],
+          [[5],   [6],  [7],  [8]]],
+         [[[9],  [10], [11],  [12]],
+          [[13], [14], [15],  [16]]]]
+    ```
+
+    The output tensor has shape `[8, 1, 3, 1]` and value:
+
+    ```prettyprint
+    x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
+         [[[0], [2], [4]]], [[[0], [10], [12]]],
+         [[[0], [5], [7]]], [[[0], [13], [15]]],
+         [[[0], [6], [8]]], [[[0], [14], [16]]]]
+    ```
+
+    Among others, this operation is useful for reducing atrous convolution into
+    regular convolution.
+
+*  <b>`name`</b>: A name for the operation (optional).
+
+##### Returns:
+
+  A `Tensor`. Has the same type as `input`.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.layers.separable_convolution2d.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.layers.separable_convolution2d.md
index 27da76601f1..e84733263de 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.layers.separable_convolution2d.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.layers.separable_convolution2d.md
@@ -24,10 +24,12 @@ to produce the end result.
 *  <b>`stride`</b>: a list of length 2: [stride_height, stride_width], specifying the
     depthwise convolution stride. Can be an int if both strides are the same.
 *  <b>`padding`</b>: one of 'VALID' or 'SAME'.
-*  <b>`activation_fn`</b>: activation function.
+*  <b>`activation_fn`</b>: activation function, set to None to skip it and maintain
+    a linear activation.
 *  <b>`normalizer_fn`</b>: normalization function to use instead of `biases`. If
-    `normalize_fn` is provided then `biases_initializer` and
+    `normalizer_fn` is provided then `biases_initializer` and
     `biases_regularizer` are ignored and `biases` are not created nor added.
+    Default is None, meaning no normalizer function is applied.
 *  <b>`normalizer_params`</b>: normalization function parameters.
 *  <b>`weights_initializer`</b>: An initializer for the weights.
 *  <b>`weights_regularizer`</b>: Optional regularizer for the weights.
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.nn.log_softmax.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.nn.log_softmax.md
index ac55b177d08..79cb45d9449 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.nn.log_softmax.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.nn.log_softmax.md
@@ -4,7 +4,7 @@ Computes log softmax activations.
 
 For each batch `i` and class `j` we have
 
-    logsoftmax = logits - log(reduce_sum(exp(logits), dim))
+    logsoftmax = logits - log(reduce_sum(exp(logits), dim))
 
 ##### Args:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.sparse_concat.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.sparse_concat.md
index 8d05472e340..15045b3b29a 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.sparse_concat.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.sparse_concat.md
@@ -83,7 +83,8 @@ Graphically this is equivalent to doing
 ##### Args:
 
 
-*  <b>`concat_dim`</b>: Dimension to concatenate along.
+*  <b>`concat_dim`</b>: Dimension to concatenate along. Must be in range [-rank, rank),
+    where rank is the number of dimensions in each input `SparseTensor`.
 *  <b>`sp_inputs`</b>: List of `SparseTensor` to concatenate.
 *  <b>`name`</b>: A name prefix for the returned tensors (optional).
 *  <b>`expand_nonconcat_dim`</b>: Whether to allow the expansion in the non-concat
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.crf.viterbi_decode.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.crf.viterbi_decode.md
new file mode 100644
index 00000000000..5e478f9c31d
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.crf.viterbi_decode.md
@@ -0,0 +1,19 @@
+### `tf.contrib.crf.viterbi_decode(score, transition_params)` {#viterbi_decode}
+
+Decode the highest scoring sequence of tags outside of TensorFlow.
+
+This should only be used at test time.
+
+##### Args:
+
+
+*  <b>`score`</b>: A [seq_len, num_tags] matrix of unary potentials.
+*  <b>`transition_params`</b>: A [num_tags, num_tags] matrix of binary potentials.
+
+##### Returns:
+
+
+*  <b>`viterbi`</b>: A [seq_len] list of integers containing the highest scoring tag
+      indices.
+*  <b>`viterbi_score`</b>: A float containing the score for the viterbi sequence.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.layers.batch_norm.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.layers.batch_norm.md
index ad852cc6080..284f8abd621 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.layers.batch_norm.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.layers.batch_norm.md
@@ -15,7 +15,7 @@ they need to be added as a dependency to the train_op, example:
 
   update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
   if update_ops:
-    updates = tf.group(update_ops)
+    updates = tf.group(*update_ops)
     total_loss = control_flow_ops.with_dependencies([updates], total_loss)
 
 One can set update_collections=None to force the updates in place, but that
@@ -32,7 +32,8 @@ can have speed penalty, specially in distributed settings.
     not used. When the next layer is linear (also e.g. `nn.relu`), this can be
     disabled since the scaling can be done by the next layer.
 *  <b>`epsilon`</b>: small float added to variance to avoid dividing by zero.
-*  <b>`activation_fn`</b>: Optional activation function.
+*  <b>`activation_fn`</b>: activation function, default set to None to skip it and
+    maintain a linear activation.
 *  <b>`updates_collections`</b>: collections to collect the update ops for computation.
     The updates_ops need to be excuted with the train_op.
     If None, a control dependency would be added to make sure the updates are
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.DNNClassifier.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.DNNClassifier.md
index fc7269d91e5..745dbdea79b 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.DNNClassifier.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.DNNClassifier.md
@@ -92,19 +92,21 @@ Initializes a DNNClassifier instance.
 
   A `DNNClassifier` estimator.
 
-
-- - -
-
-#### `tf.contrib.learn.DNNClassifier.__repr__()` {#DNNClassifier.__repr__}
+##### Raises:
 
 
+*  <b>`ValueError`</b>: If `n_classes` < 2.
 
 
 - - -
 
 #### `tf.contrib.learn.DNNClassifier.bias_` {#DNNClassifier.bias_}
 
+DEPRECATED FUNCTION
 
+THIS FUNCTION IS DEPRECATED. It will be removed after 2016-10-13.
+Instructions for updating:
+This method inspects the private state of the object, and should not be used
 
 
 - - -
@@ -114,100 +116,25 @@ Initializes a DNNClassifier instance.
 
 
 
-- - -
-
-#### `tf.contrib.learn.DNNClassifier.dnn_bias_` {#DNNClassifier.dnn_bias_}
-
-Returns bias of deep neural network part.
-
-
-- - -
-
-#### `tf.contrib.learn.DNNClassifier.dnn_weights_` {#DNNClassifier.dnn_weights_}
-
-Returns weights of deep neural network part.
-
-
 - - -
 
 #### `tf.contrib.learn.DNNClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#DNNClassifier.evaluate}
 
-See `Evaluable`.
-
-##### Raises:
-
-
-*  <b>`ValueError`</b>: If at least one of `x` or `y` is provided, and at least one of
-      `input_fn` or `feed_fn` is provided.
-      Or if `metrics` is not `None` or `dict`.
+See evaluable.Evaluable.
 
 
 - - -
 
-#### `tf.contrib.learn.DNNClassifier.export(*args, **kwargs)` {#DNNClassifier.export}
+#### `tf.contrib.learn.DNNClassifier.export(export_dir, input_fn=None, input_feature_key=None, use_deprecated_input_fn=True, signature_fn=None, default_batch_size=1, exports_to_keep=None)` {#DNNClassifier.export}
 
-Exports inference graph into given dir. (deprecated arguments)
-
-SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-09-23.
-Instructions for updating:
-The signature of the input_fn accepted by export is changing to be consistent with what's used by tf.Learn Estimator's train/evaluate. input_fn and input_feature_key will become required args, and use_deprecated_input_fn will default to False and be removed altogether.
-
-    Args:
-      export_dir: A string containing a directory to write the exported graph
-        and checkpoints.
-      input_fn: If `use_deprecated_input_fn` is true, then a function that given
-        `Tensor` of `Example` strings, parses it into features that are then
-        passed to the model. Otherwise, a function that takes no argument and
-        returns a tuple of (features, targets), where features is a dict of
-        string key to `Tensor` and targets is a `Tensor` that's currently not
-        used (and so can be `None`).
-      input_feature_key: Only used if `use_deprecated_input_fn` is false. String
-        key into the features dict returned by `input_fn` that corresponds to
-        the raw `Example` strings `Tensor` that the exported model will take as
-        input.
-      use_deprecated_input_fn: Determines the signature format of `input_fn`.
-      signature_fn: Function that returns a default signature and a named
-        signature map, given `Tensor` of `Example` strings, `dict` of `Tensor`s
-        for features and `Tensor` or `dict` of `Tensor`s for predictions.
-      prediction_key: The key for a tensor in the `predictions` dict (output
-        from the `model_fn`) to use as the `predictions` input to the
-        `signature_fn`. Optional. If `None`, predictions will pass to
-        `signature_fn` without filtering.
-      default_batch_size: Default batch size of the `Example` placeholder.
-      exports_to_keep: Number of exports to keep.
+See BaseEstimator.export.
 
 
 - - -
 
 #### `tf.contrib.learn.DNNClassifier.fit(x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None)` {#DNNClassifier.fit}
 
-See `Trainable`.
-
-##### Raises:
-
-
-*  <b>`ValueError`</b>: If `x` or `y` are not `None` while `input_fn` is not `None`.
-*  <b>`ValueError`</b>: If both `steps` and `max_steps` are not `None`.
-
-
-- - -
-
-#### `tf.contrib.learn.DNNClassifier.get_params(deep=True)` {#DNNClassifier.get_params}
-
-Get parameters for this estimator.
-
-##### Args:
-
-
-*  <b>`deep`</b>: boolean, optional
-
-    If `True`, will return the parameters for this estimator and
-    contained subobjects that are estimators.
-
-##### Returns:
-
-  params : mapping of string to any
-  Parameter names mapped to their values.
+See trainable.Trainable.
 
 
 - - -
@@ -234,21 +161,7 @@ Returns value of the variable given by name.
 
 ##### Returns:
 
-  Numpy array - value of the tensor.
-
-
-- - -
-
-#### `tf.contrib.learn.DNNClassifier.linear_bias_` {#DNNClassifier.linear_bias_}
-
-Returns bias of the linear part.
-
-
-- - -
-
-#### `tf.contrib.learn.DNNClassifier.linear_weights_` {#DNNClassifier.linear_weights_}
-
-Returns weights per feature of the linear part.
+  `Tensor` object.
 
 
 - - -
@@ -258,49 +171,6 @@ Returns weights per feature of the linear part.
 
 
 
-- - -
-
-#### `tf.contrib.learn.DNNClassifier.partial_fit(x=None, y=None, input_fn=None, steps=1, batch_size=None, monitors=None)` {#DNNClassifier.partial_fit}
-
-Incremental fit on a batch of samples.
-
-This method is expected to be called several times consecutively
-on different or the same chunks of the dataset. This either can
-implement iterative training or out-of-core/online training.
-
-This is especially useful when the whole dataset is too big to
-fit in memory at the same time. Or when model is taking long time
-to converge, and you want to split up training into subparts.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`steps`</b>: Number of steps for which to train model. If `None`, train forever.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`. Must be `None` if `input_fn` is provided.
-*  <b>`monitors`</b>: List of `BaseMonitor` subclass instances. Used for callbacks
-    inside the training loop.
-
-##### Returns:
-
-  `self`, for chaining.
-
-##### Raises:
-
-
-*  <b>`ValueError`</b>: If at least one of `x` and `y` is provided, and `input_fn` is
-      provided.
-
-
 - - -
 
 #### `tf.contrib.learn.DNNClassifier.predict(*args, **kwargs)` {#DNNClassifier.predict}
@@ -353,36 +223,14 @@ altogether. The behavior of this flag is described below.
       probabilities if as_iterable is True).
 
 
-- - -
-
-#### `tf.contrib.learn.DNNClassifier.set_params(**params)` {#DNNClassifier.set_params}
-
-Set the parameters of this estimator.
-
-The method works on simple estimators as well as on nested objects
-(such as pipelines). The former have parameters of the form
-``<component>__<parameter>`` so that it's possible to update each
-component of a nested object.
-
-##### Args:
-
-
-*  <b>`**params`</b>: Parameters.
-
-##### Returns:
-
-  self
-
-##### Raises:
-
-
-*  <b>`ValueError`</b>: If params contain invalid names.
-
-
 - - -
 
 #### `tf.contrib.learn.DNNClassifier.weights_` {#DNNClassifier.weights_}
 
-
+DEPRECATED FUNCTION
+
+THIS FUNCTION IS DEPRECATED. It will be removed after 2016-10-13.
+Instructions for updating:
+This method inspects the private state of the object, and should not be used
 
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.metrics.streaming_recall_at_k.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.metrics.streaming_recall_at_k.md
index 2945522b83c..68ba0ee73bf 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.metrics.streaming_recall_at_k.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.metrics.streaming_recall_at_k.md
@@ -29,7 +29,7 @@ Instructions for updating:
       `int64`.
     k: The number of top elements to look at for computing recall.
     ignore_mask: An optional, `bool` `Tensor` whose shape matches `predictions`.
-    weights: An optional `Tensor` whose shape matches `predictions`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
     metrics_collections: An optional list of collections that `recall_at_k`
       should be added to.
     updates_collections: An optional list of collections `update_op` should be
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.sparse_softmax_cross_entropy_with_logits.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.sparse_softmax_cross_entropy_with_logits.md
index 93fe03b2d78..1998f21e154 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.sparse_softmax_cross_entropy_with_logits.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.sparse_softmax_cross_entropy_with_logits.md
@@ -21,16 +21,16 @@ output of `softmax`, as it will produce incorrect results.
 A common use case is to have logits of shape `[batch_size, num_classes]` and
 labels of shape `[batch_size]`. But higher dimensions are supported.
 
-##### Args:
+Args:
 
-
-*  <b>`logits`</b>: Unscaled log probabilities of rank `r` and shape
+  logits: Unscaled log probabilities of rank `r` and shape
     `[d_0, d_1, ..., d_{r-2}, num_classes]` and dtype `float32` or `float64`.
-*  <b>`labels`</b>: `Tensor` of shape `[d_0, d_1, ..., d_{r-2}]` and dtype `int32` or
+  labels: `Tensor` of shape `[d_0, d_1, ..., d_{r-2}]` and dtype `int32` or
     `int64`. Each entry in `labels` must be an index in `[0, num_classes)`.
-    Other values will result in a loss of 0, but incorrect gradient
-    computations.
-*  <b>`name`</b>: A name for the operation (optional).
+    Other values will raise an exception when this op is run on CPU, and
+    return `NaN` for the corresponding loss and gradient rows
+    on GPU.
+  name: A name for the operation (optional).
 
 ##### Returns:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.crf.crf_log_norm.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.crf.crf_log_norm.md
new file mode 100644
index 00000000000..830a38940fe
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.crf.crf_log_norm.md
@@ -0,0 +1,17 @@
+### `tf.contrib.crf.crf_log_norm(inputs, sequence_lengths, transition_params)` {#crf_log_norm}
+
+Computes the normalization for a CRF.
+
+##### Args:
+
+
+*  <b>`inputs`</b>: A [batch_size, max_seq_len, num_tags] tensor of unary potentials
+      to use as input to the CRF layer.
+*  <b>`sequence_lengths`</b>: A [batch_size] vector of true sequence lengths.
+*  <b>`transition_params`</b>: A [num_tags, num_tags] transition matrix.
+
+##### Returns:
+
+
+*  <b>`log_norm`</b>: A [batch_size] vector of normalizers for a CRF.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.layers.convolution2d.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.layers.convolution2d.md
index c98765e2146..62e968c804b 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.layers.convolution2d.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.layers.convolution2d.md
@@ -27,10 +27,12 @@ greater than one.
 *  <b>`rate`</b>: integer. If less than or equal to 1, a standard convolution is used.
     If greater than 1, than the a'trous convolution is applied and `stride`
     must be set to 1.
-*  <b>`activation_fn`</b>: activation function.
+*  <b>`activation_fn`</b>: activation function, set to None to skip it and maintain
+    a linear activation.
 *  <b>`normalizer_fn`</b>: normalization function to use instead of `biases`. If
-    `normalize_fn` is provided then `biases_initializer` and
+    `normalizer_fn` is provided then `biases_initializer` and
     `biases_regularizer` are ignored and `biases` are not created nor added.
+    default set to None for no normalizer function
 *  <b>`normalizer_params`</b>: normalization function parameters.
 *  <b>`weights_initializer`</b>: An initializer for the weights.
 *  <b>`weights_regularizer`</b>: Optional regularizer for the weights.
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.layers.fully_connected.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.layers.fully_connected.md
index c10dd0eb2de..e121754e8a5 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.layers.fully_connected.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.layers.fully_connected.md
@@ -19,10 +19,12 @@ prior to the initial matrix multiply by `weights`.
 *  <b>`inputs`</b>: A tensor of with at least rank 2 and value for the last dimension,
     i.e. `[batch_size, depth]`, `[None, None, None, channels]`.
 *  <b>`num_outputs`</b>: Integer or long, the number of output units in the layer.
-*  <b>`activation_fn`</b>: activation function.
+*  <b>`activation_fn`</b>: activation function, set to None to skip it and maintain
+    a linear activation.
 *  <b>`normalizer_fn`</b>: normalization function to use instead of `biases`. If
-    `normalize_fn` is provided then `biases_initializer` and
+    `normalizer_fn` is provided then `biases_initializer` and
     `biases_regularizer` are ignored and `biases` are not created nor added.
+    default set to None for no normalizer function
 *  <b>`normalizer_params`</b>: normalization function parameters.
 *  <b>`weights_initializer`</b>: An initializer for the weights.
 *  <b>`weights_regularizer`</b>: Optional regularizer for the weights.
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.metrics.streaming_percentage_less.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.metrics.streaming_percentage_less.md
index 3b0b62c7206..ccf6097f59e 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.metrics.streaming_percentage_less.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.metrics.streaming_percentage_less.md
@@ -24,7 +24,7 @@ Instructions for updating:
     values: A numeric `Tensor` of arbitrary size.
     threshold: A scalar threshold.
     ignore_mask: An optional, `bool` `Tensor` whose shape matches `values`.
-    weights: An optional `Tensor` whose shape matches `values`.
+    weights: An optional `Tensor` whose shape is broadcastable to `values`.
     metrics_collections: An optional list of collections that the metric
       value variable should be added to.
     updates_collections: An optional list of collections that the metric update
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.model_variables.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.model_variables.md
new file mode 100644
index 00000000000..f0bba3c637e
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.model_variables.md
@@ -0,0 +1,8 @@
+### `tf.model_variables()` {#model_variables}
+
+Returns all variables in the MODEL_VARIABLES collection.
+
+##### Returns:
+
+  A list of model Variable objects.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.sparse_segment_sqrt_n_grad.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.sparse_segment_sqrt_n_grad.md
deleted file mode 100644
index 86fb7f95c58..00000000000
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.sparse_segment_sqrt_n_grad.md
+++ /dev/null
@@ -1,24 +0,0 @@
-### `tf.sparse_segment_sqrt_n_grad(grad, indices, segment_ids, output_dim0, name=None)` {#sparse_segment_sqrt_n_grad}
-
-Computes gradients for SparseSegmentSqrtN.
-
-Returns tensor "output" with same shape as grad, except for dimension 0 whose
-value is output_dim0.
-
-##### Args:
-
-
-*  <b>`grad`</b>: A `Tensor`. Must be one of the following types: `float32`, `float64`.
-    gradient propagated to the SparseSegmentSqrtN op.
-*  <b>`indices`</b>: A `Tensor`. Must be one of the following types: `int32`, `int64`.
-    indices passed to the corresponding SparseSegmentSqrtN op.
-*  <b>`segment_ids`</b>: A `Tensor` of type `int32`.
-    segment_ids passed to the corresponding SparseSegmentSqrtN op.
-*  <b>`output_dim0`</b>: A `Tensor` of type `int32`.
-    dimension 0 of "data" passed to SparseSegmentSqrtN op.
-*  <b>`name`</b>: A name for the operation (optional).
-
-##### Returns:
-
-  A `Tensor`. Has the same type as `grad`.
-
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.crf.crf_sequence_score.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.crf.crf_sequence_score.md
new file mode 100644
index 00000000000..95cbf2e8eb2
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.crf.crf_sequence_score.md
@@ -0,0 +1,19 @@
+### `tf.contrib.crf.crf_sequence_score(inputs, tag_indices, sequence_lengths, transition_params)` {#crf_sequence_score}
+
+Computes the unnormalized score for a tag sequence.
+
+##### Args:
+
+
+*  <b>`inputs`</b>: A [batch_size, max_seq_len, num_tags] tensor of unary potentials
+      to use as input to the CRF layer.
+*  <b>`tag_indices`</b>: A [batch_size, max_seq_len] matrix of tag indices for which we
+      compute the unnormalized score.
+*  <b>`sequence_lengths`</b>: A [batch_size] vector of true sequence lengths.
+*  <b>`transition_params`</b>: A [num_tags, num_tags] transition matrix.
+
+##### Returns:
+
+
+*  <b>`sequence_scores`</b>: A [batch_size] vector of unnormalized sequence scores.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.crf.crf_unary_score.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.crf.crf_unary_score.md
new file mode 100644
index 00000000000..4a344623ce2
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.crf.crf_unary_score.md
@@ -0,0 +1,16 @@
+### `tf.contrib.crf.crf_unary_score(tag_indices, sequence_lengths, inputs)` {#crf_unary_score}
+
+Computes the unary scores of tag sequences.
+
+##### Args:
+
+
+*  <b>`tag_indices`</b>: A [batch_size, max_seq_len] matrix of tag indices.
+*  <b>`sequence_lengths`</b>: A [batch_size] vector of true sequence lengths.
+*  <b>`inputs`</b>: A [batch_size, max_seq_len, num_tags] tensor of unary potentials.
+
+##### Returns:
+
+
+*  <b>`unary_scores`</b>: A [batch_size] vector of unary scores.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.metrics.streaming_mean_iou.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.metrics.streaming_mean_iou.md
index c814615417e..45eaf48ba4e 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.metrics.streaming_mean_iou.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.metrics.streaming_mean_iou.md
@@ -31,7 +31,7 @@ Instructions for updating:
       have. This value must be provided, since a confusion matrix of
       dimension = [num_classes, num_classes] will be allocated.
     ignore_mask: An optional, `bool` `Tensor` whose shape matches `predictions`.
-    weights: An optional `Tensor` whose shape matches `predictions`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
     metrics_collections: An optional list of collections that `mean_iou`
       should be added to.
     updates_collections: An optional list of collections `update_op` should be
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.metrics.streaming_recall.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.metrics.streaming_recall.md
index 6a5f6569c26..e93630f46c1 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.metrics.streaming_recall.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.metrics.streaming_recall.md
@@ -24,7 +24,7 @@ Instructions for updating:
     labels: The ground truth values, a `bool` `Tensor` whose dimensions must
       match `predictions`.
     ignore_mask: An optional, `bool` `Tensor` whose shape matches `predictions`.
-    weights: An optional `Tensor` whose shape matches `predictions`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
     metrics_collections: An optional list of collections that `recall` should
       be added to.
     updates_collections: An optional list of collections that `update_op` should
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.metrics.streaming_root_mean_squared_error.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.metrics.streaming_root_mean_squared_error.md
index 8fc8f0034f6..0a2f4686599 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.metrics.streaming_root_mean_squared_error.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.metrics.streaming_root_mean_squared_error.md
@@ -23,7 +23,7 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 
 *  <b>`predictions`</b>: A `Tensor` of arbitrary shape.
 *  <b>`labels`</b>: A `Tensor` of the same shape as `predictions`.
-*  <b>`weights`</b>: An optional `Tensor` whose shape matches `predictions`.
+*  <b>`weights`</b>: An optional `Tensor` whose shape is broadcastable to `predictions`.
 *  <b>`metrics_collections`</b>: An optional list of collections that
     `root_mean_squared_error` should be added to.
 *  <b>`updates_collections`</b>: An optional list of collections that `update_op` should
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.nn.softmax.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.nn.softmax.md
index 65da6889d97..ae969ea675a 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.nn.softmax.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.nn.softmax.md
@@ -1,6 +1,6 @@
 ### `tf.nn.softmax(logits, dim=-1, name=None)` {#softmax}
 
-Computes softmax activations.
+Computes softmax activations.
 
 For each batch `i` and class `j` we have
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.kl.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.kl.md
index 014d2792b6b..ac638150774 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.kl.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.kl.md
@@ -5,8 +5,8 @@ Get the KL-divergence KL(dist_a || dist_b).
 ##### Args:
 
 
-*  <b>`dist_a`</b>: instance of distributions.Distribution.
-*  <b>`dist_b`</b>: instance of distributions.Distribution.
+*  <b>`dist_a`</b>: The first distribution.
+*  <b>`dist_b`</b>: The second distribution.
 *  <b>`allow_nan`</b>: If `False` (default), a runtime error is raised
     if the KL returns NaN values for any batch entry of the given
     distributions.  If `True`, the KL may return a NaN for the given entry.
@@ -19,7 +19,6 @@ Get the KL-divergence KL(dist_a || dist_b).
 ##### Raises:
 
 
-*  <b>`TypeError`</b>: If dist_a or dist_b is not an instance of Distribution.
 *  <b>`NotImplementedError`</b>: If no KL method is defined for distribution types
     of dist_a and dist_b.
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.layers.convolution2d_transpose.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.layers.convolution2d_transpose.md
index 950e2a0ffb7..cbc3e50fb8c 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.layers.convolution2d_transpose.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.layers.convolution2d_transpose.md
@@ -17,10 +17,12 @@ second variable called 'biases' is added to the result of the operation.
     Can be an int if both strides are the same.  Note that presently
     both strides must have the same value.
 *  <b>`padding`</b>: one of 'VALID' or 'SAME'.
-*  <b>`activation_fn`</b>: activation function.
+*  <b>`activation_fn`</b>: activation function, set to None to skip it and maintain
+    a linear activation.
 *  <b>`normalizer_fn`</b>: normalization function to use instead of `biases`. If
-    `normalize_fn` is provided then `biases_initializer` and
+    `normalizer_fn` is provided then `biases_initializer` and
     `biases_regularizer` are ignored and `biases` are not created nor added.
+    default set to None for no normalizer function
 *  <b>`normalizer_params`</b>: normalization function parameters.
 *  <b>`weights_initializer`</b>: An initializer for the weights.
 *  <b>`weights_regularizer`</b>: Optional regularizer for the weights.
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.layers.layer_norm.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.layers.layer_norm.md
index 22909faa40d..21ef48376c9 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.layers.layer_norm.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.layers.layer_norm.md
@@ -17,7 +17,8 @@ Can be used as a normalizer function for conv2d and fully_connected.
 *  <b>`scale`</b>: If True, multiply by `gamma`. If False, `gamma` is
     not used. When the next layer is linear (also e.g. `nn.relu`), this can be
     disabled since the scaling can be done by the next layer.
-*  <b>`activation_fn`</b>: Optional activation function.
+*  <b>`activation_fn`</b>: activation function, default set to None to skip it and
+    maintain a linear activation.
 *  <b>`reuse`</b>: whether or not the layer and its variables should be reused. To be
     able to reuse the layer scope must be given.
 *  <b>`variables_collections`</b>: optional collections for the variables.
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.metrics.streaming_mean_absolute_error.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.metrics.streaming_mean_absolute_error.md
index 436d7323885..d8e38b275e5 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.metrics.streaming_mean_absolute_error.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.metrics.streaming_mean_absolute_error.md
@@ -23,7 +23,7 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 
 *  <b>`predictions`</b>: A `Tensor` of arbitrary shape.
 *  <b>`labels`</b>: A `Tensor` of the same shape as `predictions`.
-*  <b>`weights`</b>: An optional `Tensor` whose shape matches `predictions`.
+*  <b>`weights`</b>: An optional `Tensor` whose shape is broadcastable to `predictions`.
 *  <b>`metrics_collections`</b>: An optional list of collections that
     `mean_absolute_error` should be added to.
 *  <b>`updates_collections`</b>: An optional list of collections that `update_op` should
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.metrics.streaming_precision.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.metrics.streaming_precision.md
index bd9701b1b75..0afe30d1899 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.metrics.streaming_precision.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.metrics.streaming_precision.md
@@ -26,7 +26,7 @@ Instructions for updating:
     labels: The ground truth values, a `bool` `Tensor` whose dimensions must
       match `predictions`.
     ignore_mask: An optional, `bool` `Tensor` whose shape matches `predictions`.
-    weights: An optional `Tensor` whose shape matches `predictions`.
+    weights: An optional `Tensor` whose shape is broadcastable to `predictions`.
     metrics_collections: An optional list of collections that `precision` should
       be added to.
     updates_collections: An optional list of collections that `update_op` should
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.metrics.streaming_sensitivity_at_specificity.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.metrics.streaming_sensitivity_at_specificity.md
index 36bc814bc18..4ab80ec260e 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.metrics.streaming_sensitivity_at_specificity.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.metrics.streaming_sensitivity_at_specificity.md
@@ -26,7 +26,7 @@ following: https://en.wikipedia.org/wiki/Sensitivity_and_specificity
     are in the range `[0, 1]`.
 *  <b>`labels`</b>: A `bool` `Tensor` whose shape matches `predictions`.
 *  <b>`specificity`</b>: A scalar value in range `[0, 1]`.
-*  <b>`weights`</b>: An optional `Tensor` whose shape matches `predictions`.
+*  <b>`weights`</b>: An optional `Tensor` whose shape is broadcastable to `predictions`.
 *  <b>`num_thresholds`</b>: The number of thresholds to use for matching the given
     specificity.
 *  <b>`metrics_collections`</b>: An optional list of collections that `sensitivity`
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.metrics.streaming_specificity_at_sensitivity.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.metrics.streaming_specificity_at_sensitivity.md
index 1ff92410371..abbfb3a5a9a 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.metrics.streaming_specificity_at_sensitivity.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.metrics.streaming_specificity_at_sensitivity.md
@@ -26,7 +26,7 @@ following: https://en.wikipedia.org/wiki/Sensitivity_and_specificity
     are in the range `[0, 1]`.
 *  <b>`labels`</b>: A `bool` `Tensor` whose shape matches `predictions`.
 *  <b>`sensitivity`</b>: A scalar value in range `[0, 1]`.
-*  <b>`weights`</b>: An optional `Tensor` whose shape matches `predictions`.
+*  <b>`weights`</b>: An optional `Tensor` whose shape is broadcastable to `predictions`.
 *  <b>`num_thresholds`</b>: The number of thresholds to use for matching the given
     sensitivity.
 *  <b>`metrics_collections`</b>: An optional list of collections that `specificity`
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.train.SummaryWriter.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.train.SummaryWriter.md
index 8d32d95f798..a18d0f41b67 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.train.SummaryWriter.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.train.SummaryWriter.md
@@ -148,6 +148,13 @@ Adds a metadata information for a single session.run() call.
 *  <b>`ValueError`</b>: If the provided tag was already used for this type of event.
 
 
+- - -
+
+#### `tf.train.SummaryWriter.get_logdir()` {#SummaryWriter.get_logdir}
+
+Returns the directory where event file will be written.
+
+
 
 - - -
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.assign_sub.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.assign_sub.md
new file mode 100644
index 00000000000..b24da4db87a
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.assign_sub.md
@@ -0,0 +1,24 @@
+### `tf.assign_sub(ref, value, use_locking=None, name=None)` {#assign_sub}
+
+Update 'ref' by subtracting 'value' from it.
+
+This operation outputs "ref" after the update is done.
+This makes it easier to chain operations that need to use the reset value.
+
+##### Args:
+
+
+*  <b>`ref`</b>: A mutable `Tensor`. Must be one of the following types: `float32`, `float64`, `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`, `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+    Should be from a `Variable` node.
+*  <b>`value`</b>: A `Tensor`. Must have the same type as `ref`.
+    The value to be subtracted from the variable.
+*  <b>`use_locking`</b>: An optional `bool`. Defaults to `False`.
+    If True, the subtraction will be protected by a lock;
+    otherwise the behavior is undefined, but may exhibit less contention.
+*  <b>`name`</b>: A name for the operation (optional).
+
+##### Returns:
+
+  Same as "ref".  Returned as a convenience for operations that want
+  to use the new value after the variable has been updated.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.batch_to_space_nd.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.batch_to_space_nd.md
new file mode 100644
index 00000000000..40f53ba775d
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.batch_to_space_nd.md
@@ -0,0 +1,136 @@
+### `tf.batch_to_space_nd(input, block_shape, crops, name=None)` {#batch_to_space_nd}
+
+BatchToSpace for N-D tensors of type T.
+
+This operation reshapes the "batch" dimension 0 into `M + 1` dimensions of shape
+`block_shape + [batch]`, interleaves these blocks back into the grid defined by
+the spatial dimensions `[1, ..., M]`, to obtain a result with the same rank as
+the input.  The spatial dimensions of this intermediate result are then
+optionally cropped according to `crops` to produce the output.  This is the
+reverse of SpaceToBatch.  See below for a precise description.
+
+##### Args:
+
+
+*  <b>`input`</b>: A `Tensor`.
+    N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
+    where spatial_shape has M dimensions.
+*  <b>`block_shape`</b>: A `Tensor`. Must be one of the following types: `int32`, `int64`.
+    1-D with shape `[M]`, all values must be >= 1.
+*  <b>`crops`</b>: A `Tensor`. Must be one of the following types: `int32`, `int64`.
+    2-D with shape `[M, 2]`, all values must be >= 0.
+      `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input
+      dimension `i + 1`, which corresponds to spatial dimension `i`.  It is
+      required that
+      `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.
+
+    This operation is equivalent to the following steps:
+
+    1. Reshape `input` to `reshaped` of shape:
+         [block_shape[0], ..., block_shape[M-1],
+          batch / prod(block_shape),
+          input_shape[1], ..., input_shape[N-1]]
+
+    2. Permute dimensions of `reshaped` to produce `permuted` of shape
+         [batch / prod(block_shape),
+
+          input_shape[1], block_shape[0],
+          ...,
+          input_shape[M], block_shape[M-1],
+
+          input_shape[M+1], ..., input_shape[N-1]]
+
+    3. Reshape `permuted` to produce `reshaped_permuted` of shape
+         [batch / prod(block_shape),
+
+          input_shape[1] * block_shape[0],
+          ...,
+          input_shape[M] * block_shape[M-1],
+
+          input_shape[M+1],
+          ...,
+          input_shape[N-1]]
+
+    4. Crop the start and end of dimensions `[1, ..., M]` of
+       `reshaped_permuted` according to `crops` to produce the output of shape:
+         [batch / prod(block_shape),
+
+          input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],
+          ...,
+          input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],
+
+          input_shape[M+1], ..., input_shape[N-1]]
+
+    Some examples:
+
+    (1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and
+        `crops = [[0, 0], [0, 0]]`:
+
+    ```prettyprint
+    [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+    ```
+
+    The output tensor has shape `[1, 2, 2, 1]` and value:
+
+    ```prettyprint
+    x = [[[[1], [2]], [[3], [4]]]]
+    ```
+
+    (2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and
+        `crops = [[0, 0], [0, 0]]`:
+
+    ```prettyprint
+    [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+    ```
+
+    The output tensor has shape `[1, 2, 2, 3]` and value:
+
+    ```prettyprint
+    x = [[[[1, 2, 3], [4, 5, 6]],
+          [[7, 8, 9], [10, 11, 12]]]]
+    ```
+
+    (3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and
+        `crops = [[0, 0], [0, 0]]`:
+
+    ```prettyprint
+    x = [[[[1], [3]], [[5], [7]]],
+         [[[2], [4]], [[10], [12]]],
+         [[[5], [7]], [[13], [15]]],
+         [[[6], [8]], [[14], [16]]]]
+    ```
+
+    The output tensor has shape `[1, 4, 4, 1]` and value:
+
+    ```prettyprint
+    x = [[[1],   [2],  [3],  [4]],
+         [[5],   [6],  [7],  [8]],
+         [[9],  [10], [11],  [12]],
+         [[13], [14], [15],  [16]]]
+    ```
+
+    (4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and
+        `crops = [[0, 0], [2, 0]]`:
+
+    ```prettyprint
+    x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
+         [[[0], [2], [4]]], [[[0], [10], [12]]],
+         [[[0], [5], [7]]], [[[0], [13], [15]]],
+         [[[0], [6], [8]]], [[[0], [14], [16]]]]
+    ```
+
+    The output tensor has shape `[2, 2, 4, 1]` and value:
+
+    ```prettyprint
+    x = [[[[1],   [2],  [3],  [4]],
+          [[5],   [6],  [7],  [8]]],
+         [[[9],  [10], [11],  [12]],
+          [[13], [14], [15],  [16]]]]
+    ```
+
+*  <b>`name`</b>: A name for the operation (optional).
+
+##### Returns:
+
+  A `Tensor`. Has the same type as `input`.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.distributions.Mixture.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.distributions.Mixture.md
index 1ff9f5bddc3..90bf5b958ba 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.distributions.Mixture.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.distributions.Mixture.md
@@ -17,45 +17,17 @@ mixture probabilities) and a list of `Distribution` objects
 all having matching dtype, batch shape, event shape, and continuity
 properties (the components).
 
-The user does not pass the list of distributions directly, but rather a
-list of `(constructor, batch_tensor_params_dict)` pairs,
-called `components`. The list of distributions is created via:
-
-```python
-distributions = [
-  c(**params_dict) for (c, params_dict) in zip(*components)
-]
-```
-
-This form allows for certain types of batch-shape optimizations within
-this class.
-
-An example of `components`:
-
-```python
-components = [
-  (tf.contrib.distributions.Normal, {"mu": 3.0, "sigma": 1.0}),
-  (functools.partial(tf.contrib.distributions.Normal, validate_args=False),
-   {"mu": 3.0, "sigma": 2.0}),
-  (tf.contrib.distributions.Normal.from_params,
-   {"mu": 1.0, "sigma": -1.0})
-]
-```
-
 The `num_classes` of `cat` must be possible to infer at graph construction
-time and match `len(distributions)`.
+time and match `len(components)`.
 
 ##### Args:
 
 
 *  <b>`cat`</b>: A `Categorical` distribution instance, representing the probabilities
       of `distributions`.
-*  <b>`components`</b>: A list or tuple of `(constructor, batch_tensor_params)`
-    tuples.  The `constructor` must be a callable, and `batch_tensor_params`
-    must be a dict mapping constructor kwargs to batchwise parameters.
-    Each `Distribution` instance created by calling
-    `constructor(**batch_tensor_params)` must have the same type, be defined
-    on the same domain, and have matching `event_shape` and `batch_shape`.
+*  <b>`components`</b>: A list or tuple of `Distribution` instances.
+    Each instance must have the same type, be defined on the same domain,
+    and have matching `event_shape` and `batch_shape`.
 *  <b>`validate_args`</b>: `Boolean`, default `False`.  If `True`, raise a runtime
     error if batch or event ranks are inconsistent between cat and any of
     the distributions.  This is only checked if the ranks cannot be
@@ -71,16 +43,13 @@ time and match `len(distributions)`.
 
 *  <b>`TypeError`</b>: If cat is not a `Categorical`, or `components` is not
     a list or tuple, or the elements of `components` are not
-    tuples of the form `(callable, dict)`, or the objects resulting
-    from calling `callable(**dict)` are not instances of `Distribution`, or
-    the resulting instances of `Distribution` do not have matching
-    continuity properties, or do not have matching `dtype`.
-*  <b>`ValueError`</b>: If `components` is an empty list or tuple, or the
-    distributions created from `components` do have a statically known event
-    rank.  If `cat.num_classes` cannot be inferred at graph creation time,
+    instances of `Distribution`, or do not have matching `dtype`.
+*  <b>`ValueError`</b>: If `components` is an empty list or tuple, or its
+    elements do not have a statically known event rank.
+    If `cat.num_classes` cannot be inferred at graph creation time,
     or the constant value of `cat.num_classes` is not equal to
-    `len(distributions)`, or all `distributions` and `cat` do not have
-    matching static batch shapes, or all components' distributions do not
+    `len(components)`, or all `components` and `cat` do not have
+    matching static batch shapes, or all components do not
     have matching static event shapes.
 
 
@@ -159,7 +128,7 @@ cdf(x) := P[X <= x]
 
 - - -
 
-#### `tf.contrib.distributions.Mixture.distributions` {#Mixture.distributions}
+#### `tf.contrib.distributions.Mixture.components` {#Mixture.components}
 
 
 
@@ -185,7 +154,7 @@ Shanon entropy in nats.
 A lower bound on the entropy of this mixture model.
 
 The bound below is not always very tight, and its usefulness depends
-on the mixture probabilities and the distributions in use.
+on the mixture probabilities and the components in use.
 
 A lower bound is useful for ELBO when the `Mixture` is the variational
 distribution:
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.metrics.streaming_accuracy.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.metrics.streaming_accuracy.md
index 590f6a9dadd..342d6c622b9 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.metrics.streaming_accuracy.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.metrics.streaming_accuracy.md
@@ -23,7 +23,7 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 *  <b>`predictions`</b>: The predicted values, a `Tensor` of any shape.
 *  <b>`labels`</b>: The ground truth values, a `Tensor` whose shape matches
     `predictions`.
-*  <b>`weights`</b>: An optional `Tensor` whose shape matches `predictions`.
+*  <b>`weights`</b>: An optional `Tensor` whose shape is broadcastable to `predictions`.
 *  <b>`metrics_collections`</b>: An optional list of collections that `accuracy` should
     be added to.
 *  <b>`updates_collections`</b>: An optional list of collections that `update_op` should
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.metrics.streaming_mean_cosine_distance.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.metrics.streaming_mean_cosine_distance.md
index 585f2545168..822f39e19e5 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.metrics.streaming_mean_cosine_distance.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.metrics.streaming_mean_cosine_distance.md
@@ -20,8 +20,8 @@ If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 *  <b>`predictions`</b>: A `Tensor` of the same shape as `labels`.
 *  <b>`labels`</b>: A `Tensor` of arbitrary shape.
 *  <b>`dim`</b>: The dimension along which the cosine distance is computed.
-*  <b>`weights`</b>: An optional `Tensor` whose shape matches `predictions`, and whose
-    dimension `dim` is 1.
+*  <b>`weights`</b>: An optional `Tensor` whose shape is broadcastable to `predictions`,
+    and whose dimension `dim` is 1.
 *  <b>`metrics_collections`</b>: An optional list of collections that the metric
     value variable should be added to.
 *  <b>`updates_collections`</b>: An optional list of collections that the metric update
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.metrics.streaming_pearson_correlation.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.metrics.streaming_pearson_correlation.md
index 3c8a3a57567..49d7f71d550 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.metrics.streaming_pearson_correlation.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.metrics.streaming_pearson_correlation.md
@@ -43,7 +43,7 @@ https://wikipedia.org/wiki/Weighted_arithmetic_mean#Weighted_sample_variance
 ##### Raises:
 
 
-*  <b>`ValueError`</b>: If labels and predictions are of different sizes or if the
-    ignore_mask is of the wrong size or if either `metrics_collections` or
-    `updates_collections` are not a list or tuple.
+*  <b>`ValueError`</b>: If `labels` and `predictions` are of different sizes, or if
+    `weights` is the wrong size, or if either `metrics_collections` or
+    `updates_collections` are not a `list` or `tuple`.
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.metrics.streaming_sparse_average_precision_at_k.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.metrics.streaming_sparse_average_precision_at_k.md
new file mode 100644
index 00000000000..01138e58fa0
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.metrics.streaming_sparse_average_precision_at_k.md
@@ -0,0 +1,55 @@
+### `tf.contrib.metrics.streaming_sparse_average_precision_at_k(predictions, labels, k, weights=None, metrics_collections=None, updates_collections=None, name=None)` {#streaming_sparse_average_precision_at_k}
+
+Computes average precision@k of predictions with respect to sparse labels.
+
+See `sparse_average_precision_at_k` for details on formula. `weights` are
+applied to the result of `sparse_average_precision_at_k`.
+
+`streaming_sparse_average_precision_at_k` creates two local variables,
+`average_precision_at_<k>/count` and `average_precision_at_<k>/total`, that
+are used to compute the frequency. This frequency is ultimately returned as
+`precision_at_<k>`: an idempotent operation that simply divides
+`true_positive_at_<k>` by total (`true_positive_at_<k>` +
+`false_positive_at_<k>`).
+
+For estimation of the metric over a stream of data, the function creates an
+`update_op` operation that updates these variables and returns the
+`precision_at_<k>`. Internally, a `top_k` operation computes a `Tensor`
+indicating the top `k` `predictions`. Set operations applied to `top_k` and
+`labels` calculate the true positives and false positives weighted by
+`weights`. Then `update_op` increments `true_positive_at_<k>` and
+`false_positive_at_<k>` using these values.
+
+If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
+
+##### Args:
+
+
+*  <b>`predictions`</b>: Float `Tensor` with shape [D1, ... DN, num_classes] where
+    N >= 1. Commonly, N=1 and `predictions` has shape
+    [batch size, num_classes]. The final dimension contains the logit values
+    for each class. [D1, ... DN] must match `labels`.
+*  <b>`labels`</b>: `int64` `Tensor` or `SparseTensor` with shape
+    [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of
+    target classes for the associated prediction. Commonly, N=1 and `labels`
+    has shape [batch_size, num_labels]. [D1, ... DN] must match
+    `predictions_idx`. Values should be in range [0, num_classes], where
+    num_classes is the last dimension of `predictions`.
+*  <b>`k`</b>: Integer, k for @k metric. This will calculate an average precision for
+    range `[1,k]`, as documented above.
+*  <b>`weights`</b>: An optional `Tensor` whose shape is broadcastable to the first
+    [D1, ... DN] dimensions of `predictions` and `labels`.
+*  <b>`metrics_collections`</b>: An optional list of collections that values should
+    be added to.
+*  <b>`updates_collections`</b>: An optional list of collections that updates should
+    be added to.
+*  <b>`name`</b>: Name of new update operation, and namespace for other dependent ops.
+
+##### Returns:
+
+
+*  <b>`mean_average_precision`</b>: Scalar `float64` `Tensor` with the mean average
+    precision values.
+*  <b>`update`</b>: `Operation` that increments variables appropriately, and whose
+    value matches `metric`.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.image.draw_bounding_boxes.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.image.draw_bounding_boxes.md
index d2e1ab8fae3..fff67cd42f2 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.image.draw_bounding_boxes.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.image.draw_bounding_boxes.md
@@ -9,7 +9,7 @@ bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
 height of the underlying image.
 
 For example, if an image is 100 x 200 pixels and the bounding box is
-`[0.1, 0.5, 0.2, 0.9]`, the bottom-left and upper-right coordinates of the
+`[0.1, 0.2, 0.5, 0.9]`, the bottom-left and upper-right coordinates of the
 bounding box will be `(10, 40)` to `(50, 180)`.
 
 Parts of the bounding box may fall outside the image.
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.sequence_mask.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.sequence_mask.md
new file mode 100644
index 00000000000..2599cbc6b00
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.sequence_mask.md
@@ -0,0 +1,30 @@
+### `tf.sequence_mask(lengths, maxlen=None, dtype=tf.bool, name=None)` {#sequence_mask}
+
+Return a mask tensor representing the first N positions of each row.
+
+Example:
+```python
+tf.sequence_mask([1, 3, 2], 5) =
+  [[True, False, False, False, False],
+   [True, True, True, False, False],
+   [True, True, False, False, False]]
+```
+
+##### Args:
+
+
+*  <b>`lengths`</b>: 1D integer tensor, all its values < maxlen.
+*  <b>`maxlen`</b>: scalar integer tensor, maximum length of each row. Default: use
+          maximum over lengths.
+*  <b>`dtype`</b>: output type of the resulting tensor.
+*  <b>`name`</b>: name of the op.
+
+##### Returns:
+
+  A 2D mask tensor, as shown in the example above, cast to specified dtype.
+
+##### Raises:
+
+
+*  <b>`ValueError`</b>: if the arguments have invalid rank.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.assign_add.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.assign_add.md
new file mode 100644
index 00000000000..c57e4857d50
--- /dev/null
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.assign_add.md
@@ -0,0 +1,24 @@
+### `tf.assign_add(ref, value, use_locking=None, name=None)` {#assign_add}
+
+Update 'ref' by adding 'value' to it.
+
+This operation outputs "ref" after the update is done.
+This makes it easier to chain operations that need to use the reset value.
+
+##### Args:
+
+
+*  <b>`ref`</b>: A mutable `Tensor`. Must be one of the following types: `float32`, `float64`, `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`, `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+    Should be from a `Variable` node.
+*  <b>`value`</b>: A `Tensor`. Must have the same type as `ref`.
+    The value to be added to the variable.
+*  <b>`use_locking`</b>: An optional `bool`. Defaults to `False`.
+    If True, the addition will be protected by a lock;
+    otherwise the behavior is undefined, but may exhibit less contention.
+*  <b>`name`</b>: A name for the operation (optional).
+
+##### Returns:
+
+  Same as "ref".  Returned as a convenience for operations that want
+  to use the new value after the variable has been updated.
+
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.ffmpeg.decode_audio.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.ffmpeg.decode_audio.md
index b4c74fc9211..64aab3cffb9 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.ffmpeg.decode_audio.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.ffmpeg.decode_audio.md
@@ -2,13 +2,16 @@
 
 Create an op that decodes the contents of an audio file.
 
+Note that ffmpeg is free to select the "best" audio track from an mp4.
+https://trac.ffmpeg.org/wiki/Map
+
 ##### Args:
 
 
 *  <b>`contents`</b>: The binary contents of the audio file to decode. This is a
       scalar.
 *  <b>`file_format`</b>: A string specifying which format the contents will conform
-      to. This can be mp3, ogg, or wav.
+      to. This can be mp3, mp4, ogg, or wav.
 *  <b>`samples_per_second`</b>: The number of samples per second that is assumed.
       In some cases, resampling will occur to generate the correct sample
       rate.
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.space_to_batch.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.space_to_batch.md
index 04b9389c1c2..d83baacdd3f 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.space_to_batch.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.space_to_batch.md
@@ -2,6 +2,8 @@
 
 SpaceToBatch for 4-D tensors of type T.
 
+This is a legacy version of the more general SpaceToBatchND.
+
 Zero-pads and then rearranges (permutes) blocks of spatial data into batch.
 More specifically, this op outputs a copy of the input tensor where values from
 the `height` and `width` dimensions are moved to the `batch` dimension. After
diff --git a/tensorflow/g3doc/api_docs/python/image.md b/tensorflow/g3doc/api_docs/python/image.md
index a38f3f6f069..257b2377e30 100644
--- a/tensorflow/g3doc/api_docs/python/image.md
+++ b/tensorflow/g3doc/api_docs/python/image.md
@@ -1227,7 +1227,7 @@ bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
 height of the underlying image.
 
 For example, if an image is 100 x 200 pixels and the bounding box is
-`[0.1, 0.5, 0.2, 0.9]`, the bottom-left and upper-right coordinates of the
+`[0.1, 0.2, 0.5, 0.9]`, the bottom-left and upper-right coordinates of the
 bounding box will be `(10, 40)` to `(50, 180)`.
 
 Parts of the bounding box may fall outside the image.
diff --git a/tensorflow/g3doc/api_docs/python/index.md b/tensorflow/g3doc/api_docs/python/index.md
index d01afd92bc4..c1b99b4740e 100644
--- a/tensorflow/g3doc/api_docs/python/index.md
+++ b/tensorflow/g3doc/api_docs/python/index.md
@@ -95,6 +95,7 @@
   * [`local_variables`](../../api_docs/python/state_ops.md#local_variables)
   * [`make_template`](../../api_docs/python/state_ops.md#make_template)
   * [`min_max_variable_partitioner`](../../api_docs/python/state_ops.md#min_max_variable_partitioner)
+  * [`model_variables`](../../api_docs/python/state_ops.md#model_variables)
   * [`moving_average_variables`](../../api_docs/python/state_ops.md#moving_average_variables)
   * [`no_regularizer`](../../api_docs/python/state_ops.md#no_regularizer)
   * [`ones_initializer`](../../api_docs/python/state_ops.md#ones_initializer)
@@ -121,6 +122,7 @@
 
 * **[Tensor Transformations](../../api_docs/python/array_ops.md)**:
   * [`batch_to_space`](../../api_docs/python/array_ops.md#batch_to_space)
+  * [`batch_to_space_nd`](../../api_docs/python/array_ops.md#batch_to_space_nd)
   * [`bitcast`](../../api_docs/python/array_ops.md#bitcast)
   * [`boolean_mask`](../../api_docs/python/array_ops.md#boolean_mask)
   * [`cast`](../../api_docs/python/array_ops.md#cast)
@@ -137,15 +139,18 @@
   * [`pack`](../../api_docs/python/array_ops.md#pack)
   * [`pad`](../../api_docs/python/array_ops.md#pad)
   * [`rank`](../../api_docs/python/array_ops.md#rank)
+  * [`required_space_to_batch_paddings`](../../api_docs/python/array_ops.md#required_space_to_batch_paddings)
   * [`reshape`](../../api_docs/python/array_ops.md#reshape)
   * [`reverse`](../../api_docs/python/array_ops.md#reverse)
   * [`reverse_sequence`](../../api_docs/python/array_ops.md#reverse_sequence)
   * [`saturate_cast`](../../api_docs/python/array_ops.md#saturate_cast)
+  * [`sequence_mask`](../../api_docs/python/array_ops.md#sequence_mask)
   * [`shape`](../../api_docs/python/array_ops.md#shape)
   * [`shape_n`](../../api_docs/python/array_ops.md#shape_n)
   * [`size`](../../api_docs/python/array_ops.md#size)
   * [`slice`](../../api_docs/python/array_ops.md#slice)
   * [`space_to_batch`](../../api_docs/python/array_ops.md#space_to_batch)
+  * [`space_to_batch_nd`](../../api_docs/python/array_ops.md#space_to_batch_nd)
   * [`space_to_depth`](../../api_docs/python/array_ops.md#space_to_depth)
   * [`split`](../../api_docs/python/array_ops.md#split)
   * [`squeeze`](../../api_docs/python/array_ops.md#squeeze)
@@ -188,6 +193,7 @@
   * [`digamma`](../../api_docs/python/math_ops.md#digamma)
   * [`div`](../../api_docs/python/math_ops.md#div)
   * [`edit_distance`](../../api_docs/python/math_ops.md#edit_distance)
+  * [`einsum`](../../api_docs/python/math_ops.md#einsum)
   * [`erf`](../../api_docs/python/math_ops.md#erf)
   * [`erfc`](../../api_docs/python/math_ops.md#erfc)
   * [`exp`](../../api_docs/python/math_ops.md#exp)
@@ -249,7 +255,6 @@
   * [`sin`](../../api_docs/python/math_ops.md#sin)
   * [`sparse_segment_mean`](../../api_docs/python/math_ops.md#sparse_segment_mean)
   * [`sparse_segment_sqrt_n`](../../api_docs/python/math_ops.md#sparse_segment_sqrt_n)
-  * [`sparse_segment_sqrt_n_grad`](../../api_docs/python/math_ops.md#sparse_segment_sqrt_n_grad)
   * [`sparse_segment_sum`](../../api_docs/python/math_ops.md#sparse_segment_sum)
   * [`sqrt`](../../api_docs/python/math_ops.md#sqrt)
   * [`square`](../../api_docs/python/math_ops.md#square)
@@ -657,6 +662,15 @@
   * [`ELBOForms`](../../api_docs/python/contrib.bayesflow.variational_inference.md#ELBOForms)
   * [`register_prior`](../../api_docs/python/contrib.bayesflow.variational_inference.md#register_prior)
 
+* **[CRF (contrib)](../../api_docs/python/contrib.crf.md)**:
+  * [`crf_binary_score`](../../api_docs/python/contrib.crf.md#crf_binary_score)
+  * [`crf_log_likelihood`](../../api_docs/python/contrib.crf.md#crf_log_likelihood)
+  * [`crf_log_norm`](../../api_docs/python/contrib.crf.md#crf_log_norm)
+  * [`crf_sequence_score`](../../api_docs/python/contrib.crf.md#crf_sequence_score)
+  * [`crf_unary_score`](../../api_docs/python/contrib.crf.md#crf_unary_score)
+  * [`CrfForwardRnnCell`](../../api_docs/python/contrib.crf.md#CrfForwardRnnCell)
+  * [`viterbi_decode`](../../api_docs/python/contrib.crf.md#viterbi_decode)
+
 * **[Statistical distributions (contrib)](../../api_docs/python/contrib.distributions.md)**:
   * [`BaseDistribution`](../../api_docs/python/contrib.distributions.md#BaseDistribution)
   * [`Bernoulli`](../../api_docs/python/contrib.distributions.md#Bernoulli)
@@ -943,6 +957,7 @@
   * [`streaming_recall_at_k`](../../api_docs/python/contrib.metrics.md#streaming_recall_at_k)
   * [`streaming_root_mean_squared_error`](../../api_docs/python/contrib.metrics.md#streaming_root_mean_squared_error)
   * [`streaming_sensitivity_at_specificity`](../../api_docs/python/contrib.metrics.md#streaming_sensitivity_at_specificity)
+  * [`streaming_sparse_average_precision_at_k`](../../api_docs/python/contrib.metrics.md#streaming_sparse_average_precision_at_k)
   * [`streaming_sparse_precision_at_k`](../../api_docs/python/contrib.metrics.md#streaming_sparse_precision_at_k)
   * [`streaming_sparse_recall_at_k`](../../api_docs/python/contrib.metrics.md#streaming_sparse_recall_at_k)
   * [`streaming_specificity_at_sensitivity`](../../api_docs/python/contrib.metrics.md#streaming_specificity_at_sensitivity)
diff --git a/tensorflow/g3doc/api_docs/python/math_ops.md b/tensorflow/g3doc/api_docs/python/math_ops.md
index 2707a91a59c..32df4d2e67b 100644
--- a/tensorflow/g3doc/api_docs/python/math_ops.md
+++ b/tensorflow/g3doc/api_docs/python/math_ops.md
@@ -78,6 +78,32 @@ Returns x * y element-wise.
   A `Tensor`. Has the same type as `x`.
 
 
+- - -
+
+### `tf.scalar_mul(scalar, x)` {#scalar_mul}
+
+Multiplies a scalar times a `Tensor` or `IndexedSlices` object.
+
+Intended for use in gradient code which might deal with `IndexedSlices`
+objects, which are easy to multiply by a scalar but more expensive to
+multiply with arbitrary tensors.
+
+##### Args:
+
+
+*  <b>`scalar`</b>: A 0-D scalar `Tensor`. Must have known shape.
+*  <b>`x`</b>: A `Tensor` or `IndexedSlices` to be scaled.
+
+##### Returns:
+
+  `scalar * x` of the same type (`Tensor` or `IndexedSlices`) as `x`.
+
+##### Raises:
+
+
+*  <b>`ValueError`</b>: if scalar is not a 0-D `scalar`.
+
+
 - - -
 
 ### `tf.div(x, y, name=None)` {#div}
@@ -2345,6 +2371,9 @@ Returns the element-wise sum of a list of tensors.
 Optionally, pass `shape` and `tensor_dtype` for shape and type checking,
 otherwise, these are inferred.
 
+NOTE: This operation is not differentiable and cannot be used if inputs depend
+on trainable variables. Please use tf.add_n for such cases.
+
 For example:
 
 ```python
@@ -2377,6 +2406,16 @@ tf.accumulate_n([a, b, a], shape=[2, 2], tensor_dtype=tf.int32)
 
 
 
+- - -
+
+### `tf.einsum(axes, *inputs)` {#einsum}
+
+A generalized contraction between tensors of arbitrary dimension.
+
+Like numpy.einsum.
+
+
+
 ## Scan
 
 TensorFlow provides several operations that you can use to perform scans
@@ -3103,58 +3142,3 @@ invert_permutation(x) ==> [2, 4, 3, 0, 1]
   A `Tensor`. Has the same type as `x`. 1-D.
 
 
-
-## Other Functions and Classes
-- - -
-
-### `tf.scalar_mul(scalar, x)` {#scalar_mul}
-
-Multiplies a scalar times a `Tensor` or `IndexedSlices` object.
-
-Intended for use in gradient code which might deal with `IndexedSlices`
-objects, which are easy to multiply by a scalar but more expensive to
-multiply with arbitrary tensors.
-
-##### Args:
-
-
-*  <b>`scalar`</b>: A 0-D scalar `Tensor`. Must have known shape.
-*  <b>`x`</b>: A `Tensor` or `IndexedSlices` to be scaled.
-
-##### Returns:
-
-  `scalar * x` of the same type (`Tensor` or `IndexedSlices`) as `x`.
-
-##### Raises:
-
-
-*  <b>`ValueError`</b>: if scalar is not a 0-D `scalar`.
-
-
-- - -
-
-### `tf.sparse_segment_sqrt_n_grad(grad, indices, segment_ids, output_dim0, name=None)` {#sparse_segment_sqrt_n_grad}
-
-Computes gradients for SparseSegmentSqrtN.
-
-Returns tensor "output" with same shape as grad, except for dimension 0 whose
-value is output_dim0.
-
-##### Args:
-
-
-*  <b>`grad`</b>: A `Tensor`. Must be one of the following types: `float32`, `float64`.
-    gradient propagated to the SparseSegmentSqrtN op.
-*  <b>`indices`</b>: A `Tensor`. Must be one of the following types: `int32`, `int64`.
-    indices passed to the corresponding SparseSegmentSqrtN op.
-*  <b>`segment_ids`</b>: A `Tensor` of type `int32`.
-    segment_ids passed to the corresponding SparseSegmentSqrtN op.
-*  <b>`output_dim0`</b>: A `Tensor` of type `int32`.
-    dimension 0 of "data" passed to SparseSegmentSqrtN op.
-*  <b>`name`</b>: A name for the operation (optional).
-
-##### Returns:
-
-  A `Tensor`. Has the same type as `grad`.
-
-
diff --git a/tensorflow/g3doc/api_docs/python/nn.md b/tensorflow/g3doc/api_docs/python/nn.md
index f6873f51d2b..abd4b27d5cb 100644
--- a/tensorflow/g3doc/api_docs/python/nn.md
+++ b/tensorflow/g3doc/api_docs/python/nn.md
@@ -1451,7 +1451,7 @@ equivalent formulation
 
 ### `tf.nn.softmax(logits, dim=-1, name=None)` {#softmax}
 
-Computes softmax activations.
+Computes softmax activations.
 
 For each batch `i` and class `j` we have
 
@@ -1485,7 +1485,7 @@ Computes log softmax activations.
 
 For each batch `i` and class `j` we have
 
-    logsoftmax = logits - log(reduce_sum(exp(logits), dim))
+    logsoftmax = logits - log(reduce_sum(exp(logits), dim))
 
 ##### Args:
 
@@ -1572,16 +1572,16 @@ output of `softmax`, as it will produce incorrect results.
 A common use case is to have logits of shape `[batch_size, num_classes]` and
 labels of shape `[batch_size]`. But higher dimensions are supported.
 
-##### Args:
+Args:
 
-
-*  <b>`logits`</b>: Unscaled log probabilities of rank `r` and shape
+  logits: Unscaled log probabilities of rank `r` and shape
     `[d_0, d_1, ..., d_{r-2}, num_classes]` and dtype `float32` or `float64`.
-*  <b>`labels`</b>: `Tensor` of shape `[d_0, d_1, ..., d_{r-2}]` and dtype `int32` or
+  labels: `Tensor` of shape `[d_0, d_1, ..., d_{r-2}]` and dtype `int32` or
     `int64`. Each entry in `labels` must be an index in `[0, num_classes)`.
-    Other values will result in a loss of 0, but incorrect gradient
-    computations.
-*  <b>`name`</b>: A name for the operation (optional).
+    Other values will raise an exception when this op is run on CPU, and
+    return `NaN` for the corresponding loss and gradient rows
+    on GPU.
+  name: A name for the operation (optional).
 
 ##### Returns:
 
@@ -1897,7 +1897,7 @@ for correctness than performance, unlike in rnn().
 Creates a recurrent neural network specified by RNNCell `cell`.
 
 The simplest form of RNN network generated is:
-```py
+```python
   state = cell.zero_state(...)
   outputs = []
   for input_ in inputs:
@@ -1914,11 +1914,13 @@ sequence length of the minibatch (thus saving computational time),
 and properly propagates the state at an example's sequence length
 to the final state output.
 
-The dynamic calculation performed is, at time t for batch row b,
+The dynamic calculation performed is, at time `t` for batch row `b`,
+```python
   (output, state)(b, t) =
     (t >= sequence_length(b))
       ? (zeros(cell.output_size), states(b, sequence_length(b) - 1))
       : cell(input(b, t), state(b, t - 1))
+```
 
 ##### Args:
 
diff --git a/tensorflow/g3doc/api_docs/python/script_ops.md b/tensorflow/g3doc/api_docs/python/script_ops.md
index d952a02112d..2002f11b770 100644
--- a/tensorflow/g3doc/api_docs/python/script_ops.md
+++ b/tensorflow/g3doc/api_docs/python/script_ops.md
@@ -12,7 +12,6 @@ Note: Functions taking `Tensor` arguments can also take anything accepted by
 TensorFlow provides allows you to wrap python/numpy functions as
 TensorFlow operators.
 
-## Other Functions and Classes
 - - -
 
 ### `tf.py_func(func, inp, Tout, stateful=True, name=None)` {#py_func}
@@ -38,8 +37,8 @@ sinh(x) as an op in the graph.
 
 *  <b>`func`</b>: A python function.
 *  <b>`inp`</b>: A list of `Tensor`.
-*  <b>`Tout`</b>: A list of tensorflow data types or a single tensorflow data type
-        indicating what `func` returns.
+*  <b>`Tout`</b>: A list or tuple of tensorflow data types or a single tensorflow data
+        type if there is only one, indicating what `func` returns.
 *  <b>`stateful`</b>: A boolean indicating whether the function should be considered
             stateful or stateless. I.e. whether it, given the same input, will
             return the same output and at the same time does not change state
diff --git a/tensorflow/g3doc/api_docs/python/sparse_ops.md b/tensorflow/g3doc/api_docs/python/sparse_ops.md
index 51863c646c3..bab032b2061 100644
--- a/tensorflow/g3doc/api_docs/python/sparse_ops.md
+++ b/tensorflow/g3doc/api_docs/python/sparse_ops.md
@@ -640,7 +640,8 @@ Graphically this is equivalent to doing
 ##### Args:
 
 
-*  <b>`concat_dim`</b>: Dimension to concatenate along.
+*  <b>`concat_dim`</b>: Dimension to concatenate along. Must be in range [-rank, rank),
+    where rank is the number of dimensions in each input `SparseTensor`.
 *  <b>`sp_inputs`</b>: List of `SparseTensor` to concatenate.
 *  <b>`name`</b>: A name prefix for the returned tensors (optional).
 *  <b>`expand_nonconcat_dim`</b>: Whether to allow the expansion in the non-concat
diff --git a/tensorflow/g3doc/api_docs/python/state_ops.md b/tensorflow/g3doc/api_docs/python/state_ops.md
index 7df7901cd1b..727481f24d0 100644
--- a/tensorflow/g3doc/api_docs/python/state_ops.md
+++ b/tensorflow/g3doc/api_docs/python/state_ops.md
@@ -1208,6 +1208,17 @@ Returns all variables created with collection=[LOCAL_VARIABLES].
   A list of local Variable objects.
 
 
+- - -
+
+### `tf.model_variables()` {#model_variables}
+
+Returns all variables in the MODEL_VARIABLES collection.
+
+##### Returns:
+
+  A list of local Variable objects.
+
+
 - - -
 
 ### `tf.moving_average_variables()` {#moving_average_variables}
@@ -1345,6 +1356,92 @@ logged by the C++ runtime. This is expected.
 
 
 
+- - -
+
+### `tf.assign(ref, value, validate_shape=None, use_locking=None, name=None)` {#assign}
+
+Update 'ref' by assigning 'value' to it.
+
+This operation outputs "ref" after the assignment is done.
+This makes it easier to chain operations that need to use the reset value.
+
+##### Args:
+
+
+*  <b>`ref`</b>: A mutable `Tensor`.
+    Should be from a `Variable` node. May be uninitialized.
+*  <b>`value`</b>: A `Tensor`. Must have the same type as `ref`.
+    The value to be assigned to the variable.
+*  <b>`validate_shape`</b>: An optional `bool`. Defaults to `True`.
+    If true, the operation will validate that the shape
+    of 'value' matches the shape of the Tensor being assigned to.  If false,
+    'ref' will take on the shape of 'value'.
+*  <b>`use_locking`</b>: An optional `bool`. Defaults to `True`.
+    If True, the assignment will be protected by a lock;
+    otherwise the behavior is undefined, but may exhibit less contention.
+*  <b>`name`</b>: A name for the operation (optional).
+
+##### Returns:
+
+  Same as "ref".  Returned as a convenience for operations that want
+  to use the new value after the variable has been reset.
+
+
+- - -
+
+### `tf.assign_add(ref, value, use_locking=None, name=None)` {#assign_add}
+
+Update 'ref' by adding 'value' to it.
+
+This operation outputs "ref" after the update is done.
+This makes it easier to chain operations that need to use the reset value.
+
+##### Args:
+
+
+*  <b>`ref`</b>: A mutable `Tensor`. Must be one of the following types: `float32`, `float64`, `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`, `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+    Should be from a `Variable` node.
+*  <b>`value`</b>: A `Tensor`. Must have the same type as `ref`.
+    The value to be added to the variable.
+*  <b>`use_locking`</b>: An optional `bool`. Defaults to `False`.
+    If True, the addition will be protected by a lock;
+    otherwise the behavior is undefined, but may exhibit less contention.
+*  <b>`name`</b>: A name for the operation (optional).
+
+##### Returns:
+
+  Same as "ref".  Returned as a convenience for operations that want
+  to use the new value after the variable has been updated.
+
+
+- - -
+
+### `tf.assign_sub(ref, value, use_locking=None, name=None)` {#assign_sub}
+
+Update 'ref' by subtracting 'value' from it.
+
+This operation outputs "ref" after the update is done.
+This makes it easier to chain operations that need to use the reset value.
+
+##### Args:
+
+
+*  <b>`ref`</b>: A mutable `Tensor`. Must be one of the following types: `float32`, `float64`, `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`, `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+    Should be from a `Variable` node.
+*  <b>`value`</b>: A `Tensor`. Must have the same type as `ref`.
+    The value to be subtracted from the variable.
+*  <b>`use_locking`</b>: An optional `bool`. Defaults to `False`.
+    If True, the subtraction will be protected by a lock;
+    otherwise the behavior is undefined, but may exhibit less contention.
+*  <b>`name`</b>: A name for the operation (optional).
+
+##### Returns:
+
+  Same as "ref".  Returned as a convenience for operations that want
+  to use the new value after the variable has been updated.
+
+
+
 ## Saving and Restoring Variables
 
 - - -
diff --git a/tensorflow/g3doc/api_docs/python/train.md b/tensorflow/g3doc/api_docs/python/train.md
index 4c8b62b7d76..0920c41da6f 100644
--- a/tensorflow/g3doc/api_docs/python/train.md
+++ b/tensorflow/g3doc/api_docs/python/train.md
@@ -3520,6 +3520,13 @@ Adds a metadata information for a single session.run() call.
 *  <b>`ValueError`</b>: If the provided tag was already used for this type of event.
 
 
+- - -
+
+#### `tf.train.SummaryWriter.get_logdir()` {#SummaryWriter.get_logdir}
+
+Returns the directory where event files will be written.
+
+
 
 - - -
 
diff --git a/tensorflow/g3doc/how_tos/hadoop/index.md b/tensorflow/g3doc/how_tos/hadoop/index.md
new file mode 100644
index 00000000000..e98299f72d2
--- /dev/null
+++ b/tensorflow/g3doc/how_tos/hadoop/index.md
@@ -0,0 +1,51 @@
+# How to run TensorFlow on Hadoop
+
+This document describes how to run TensorFlow on Hadoop. It will be expanded to
+describe running on various cluster managers, but only describes running on HDFS
+at the moment.
+
+## HDFS
+
+We assume that you are familiar with [reading data](../reading_data/index.md).
+
+To use HDFS with TensorFlow, change the file paths you use to read and write
+data to an HDFS path. For example:
+
+```python
+filename_queue = tf.train.string_input_producer([
+    "hdfs://namenode:8020/path/to/file1.csv",
+    "hdfs://namenode:8020/path/to/file2.csv",
+])
+```
+
+If you want to use the namenode specified in your HDFS configuration files, then
+change the file prefix to `hdfs://default/`.
+
+When launching your TensorFlow program, the following environment variables must
+be set:
+
+*   **JAVA_HOME**: The location of your Java installation.
+*   **HADOOP_HDFS_HOME**: The location of your HDFS installation. You can also
+    set this environment variable by running:
+
+```shell
+source $HADOOP_HOME/libexec/hadoop-config.sh
+```
+
+*   **LD_LIBRARY_PATH**: To include the path to libjvm.so. On Linux:
+
+```shell
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$JAVA_HOME/jre/lib/amd64/server
+```
+
+*   **CLASSPATH**: The Hadoop jars must be added prior to running your
+    TensorFlow program. The CLASSPATH set by
+    `$HADOOP_HOME/libexec/hadoop-config.sh` is insufficient. Globs must be
+    expanded as described in the libhdfs documentation:
+
+```shell
+CLASSPATH=$($HADOOP_HDFS_HOME/bin/hdfs classpath --glob) python your_script.py
+```
+
+If you are running [Distributed TensorFlow](../distributed/index.md), then all
+workers must have the environment variables set and Hadoop installed.
diff --git a/tensorflow/g3doc/how_tos/index.md b/tensorflow/g3doc/how_tos/index.md
index c1f92b785e2..c049bdbda50 100644
--- a/tensorflow/g3doc/how_tos/index.md
+++ b/tensorflow/g3doc/how_tos/index.md
@@ -142,3 +142,10 @@ quantized parameters and calculations. It also describes how the quantization
 process works under the hood.
 
 [View Tutorial](../how_tos/quantization/index.md)
+
+## How to run TensorFlow on Hadoop
+
+This tutorial shows how to read and write HDFS files, and will later describe
+running on cluster managers.
+
+[View Tutorial](../how_tos/hadoop/index.md)
diff --git a/tensorflow/g3doc/how_tos/leftnav_files b/tensorflow/g3doc/how_tos/leftnav_files
index 9371098b0be..a88398265f2 100644
--- a/tensorflow/g3doc/how_tos/leftnav_files
+++ b/tensorflow/g3doc/how_tos/leftnav_files
@@ -8,4 +8,5 @@ distributed/index.md
 adding_an_op/index.md
 new_data_formats/index.md
 using_gpu/index.md
-variable_scope/index.md
\ No newline at end of file
+variable_scope/index.md
+hadoop/index.md
diff --git a/tensorflow/g3doc/tutorials/mnist/beginners/index.md b/tensorflow/g3doc/tutorials/mnist/beginners/index.md
index 08baf7e70d4..7de6981ba16 100644
--- a/tensorflow/g3doc/tutorials/mnist/beginners/index.md
+++ b/tensorflow/g3doc/tutorials/mnist/beginners/index.md
@@ -343,6 +343,14 @@ each element of `y_` with the corresponding element of `tf.log(y)`. Then
 `reduction_indices=[1]` parameter. Finally, `tf.reduce_mean` computes the mean
 over all the examples in the batch.
 
+(Note that in the source code, we don't use this formulation, because it is
+numerically unstable.  Instead, we apply
+`tf.nn.softmax_cross_entropy_with_logits` on the unnormalized logits (e.g., we
+call `softmax_cross_entropy_with_logits` on `tf.matmul(x, W) + b`), because this
+more numerically stable function internally computes the softmax activation.  In
+your code, consider using tf.nn.(sparse_)softmax_cross_entropy_with_logits
+instead).
+
 Now that we know what we want our model to do, it's very easy to have TensorFlow
 train it to do so.  Because TensorFlow knows the entire graph of your
 computations, it can automatically use the
diff --git a/tensorflow/g3doc/tutorials/mnist/pros/index.md b/tensorflow/g3doc/tutorials/mnist/pros/index.md
index d3a0af6e652..d77e70c8950 100644
--- a/tensorflow/g3doc/tutorials/mnist/pros/index.md
+++ b/tensorflow/g3doc/tutorials/mnist/pros/index.md
@@ -160,24 +160,25 @@ sess.run(tf.initialize_all_variables())
 ### Predicted Class and Loss Function
 
 We can now implement our regression model. It only takes one line!  We multiply
-the vectorized input images `x` by the weight matrix `W`, add the bias `b`, and
-compute the softmax probabilities that are assigned to each class.
+the vectorized input images `x` by the weight matrix `W`, add the bias `b`.
 
 ```python
-y = tf.nn.softmax(tf.matmul(x,W) + b)
+y = tf.matmul(x,W) + b
 ```
 
 We can specify a loss function just as easily. Loss indicates how bad the
 model's prediction was on a single example; we try to minimize that while
 training across all the examples. Here, our loss function is the cross-entropy
-between the target and the model's prediction:
+between the target and the softmax activation function applied to the model's
+prediction.  As in the beginners tutorial, we use the stable formulation:
 
 ```python
-cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
+cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y, y_))
 ```
 
-Note that `tf.reduce_sum` sums across all classes and `tf.reduce_mean` takes 
-the average over these sums.
+Note that `tf.nn.softmax_cross_entropy_with_logits` internally applies the
+softmax on the model's unnormalized prediction and sums across all
+classes, and `tf.reduce_mean` takes the average over these sums.
 
 ## Train the Model
 
@@ -364,14 +365,14 @@ h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
 
 ### Readout Layer
 
-Finally, we add a softmax layer, just like for the one layer softmax regression
+Finally, we add a layer, just like for the one layer softmax regression
 above.
 
 ```python
 W_fc2 = weight_variable([1024, 10])
 b_fc2 = bias_variable([10])
 
-y_conv=tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)
+y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
 ```
 
 ### Train and Evaluate the Model
@@ -393,7 +394,7 @@ Feel free to go ahead and run this code, but it does 20,000 training iterations
 and may take a while (possibly up to half an hour), depending on your processor.
 
 ```python
-cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y_conv), reduction_indices=[1]))
+cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y_conv, y_))
 train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
 correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(y_,1))
 accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 01996d67ae1..6a1a04191a7 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -81,10 +81,6 @@ tf_py_test(
         ":platform",
         ":platform_test",
     ],
-    tags = [
-        "manual",
-        "notap",
-    ],
 )
 
 tf_py_test(
@@ -1566,7 +1562,10 @@ py_library(
     name = "util",
     srcs = glob(
         ["util/**/*.py"],
-        exclude = ["util/example_parser*"],
+        exclude = [
+            "util/example_parser*",
+            "util/**/*_test.py",
+        ],
     ),
     srcs_version = "PY2AND3",
     deps = ["@protobuf//:protobuf_python"],
@@ -1584,6 +1583,19 @@ py_test(
     ],
 )
 
+py_test(
+    name = "future_api_test",
+    size = "small",
+    srcs = [
+        "util/future_api_test.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":util",
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
 py_library(
     name = "util_example_parser_configuration",
     srcs = ["util/example_parser_configuration.py"],
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index 03f6d5a6ed4..e2786bde359 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -142,31 +142,23 @@ _allowed_symbols = [
     'RunMetadata',
     'SessionLog',
     'Summary',
+    'initialize_all_tables',
+]
+
+# The following symbols are kept for compatibility. It is our plan
+# to remove them in the future.
+_allowed_symbols.extend([
     'arg_max',
     'arg_min',
-    'assign',
-    'assign_add',
-    'assign_sub',
-    'bitcast',
-    'bytes',
-    'compat',
     'create_partitioned_variables',
     'deserialize_many_sparse',
-    'initialize_all_tables',
     'lin_space',
-    'list_diff',
+    'list_diff',  # Use tf.listdiff instead.
     'parse_single_sequence_example',
-    'py_func',
-    'scalar_mul',
     'serialize_many_sparse',
     'serialize_sparse',
-    'shape_n',
-    'sparse_matmul',
-    'sparse_segment_mean_grad',
-    'sparse_segment_sqrt_n_grad',
-    'unique_with_counts',
-    'user_ops',
-]
+    'sparse_matmul',  # Use tf.matmul instead.
+])
 
 # This is needed temporarily because we import it explicitly.
 _allowed_symbols.extend([
@@ -225,6 +217,7 @@ _allowed_symbols.extend([
 # Export modules and constants.
 _allowed_symbols.extend([
     'app',
+    'compat',
     'errors',
     'flags',
     'gfile',
@@ -238,6 +231,7 @@ _allowed_symbols.extend([
     'sysconfig',
     'test',
     'train',
+    'user_ops',
 ])
 
 # Variables framework.versions:
@@ -251,7 +245,7 @@ _allowed_symbols.extend([
 # referenced in the whitelist.
 remove_undocumented(__name__, _allowed_symbols,
                     [framework_lib, array_ops, client_lib, check_ops,
-                     constant_op, control_flow_ops, functional_ops,
+                     compat, constant_op, control_flow_ops, functional_ops,
                      histogram_ops, io_ops, math_ops, nn, script_ops,
                      session_ops, sparse_ops, state_ops, string_ops,
                      summary, tensor_array_ops, train])
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index 6081097602a..197a73d0fda 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -299,28 +299,33 @@ class _DefinedFunction(object):
                input_types,
                func_name=None,
                grad_func=None,
-               python_grad_func=None):
+               python_grad_func=None,
+               **kwargs):
     """Creates _DefinedFunction.
 
     Args:
       func:  A python callable which constructs a tf function body.
       input_types: The function's argument types. Can be a tuple, list of
-        tf data types, or a dictionary of argument names to their types.
+        tf data types.
       func_name: The function name. Defaults to None, in which derives from
         'func'.
       grad_func: This function's gradient function, if not None. Defaults
         to None.
       python_grad_func: A python callable implementing the gradient of
         the function python-side.
+      **kwargs: The keyword arguments. **kwargs is passed to every call
+        site of this function.
 
     Raises:
       ValueError: The function definition is invalid.
+
     """
     self._func = func
     self._input_types = input_types
     self._func_name = func_name or _get_func_name(func)
     self._grad_func = grad_func
     self._python_grad_func = python_grad_func
+    self._extra_kwargs = kwargs
     self._definition = None  # Constructed lazily.
 
     argspec = inspect.getargspec(func)
@@ -344,16 +349,11 @@ class _DefinedFunction(object):
       argnames = argspec.args[1:]
 
     self._args = []
-    if isinstance(input_types, (list, tuple)):
-      for i in range(len(input_types)):
-        argname = argnames[i] if i < len(argnames) else ("arg%d" % i)
-        argtype = input_types[i]
-        self._args.append((argname, argtype))
-    else:
-      for name in argnames:
-        if name not in input_types:
-          raise ValueError("Missing type for argument: " + name)
-        self._args.append((name, input_types[name]))
+    assert isinstance(input_types, (list, tuple))
+    for i in range(len(input_types)):
+      argname = argnames[i] if i < len(argnames) else ("arg%d" % i)
+      argtype = input_types[i]
+      self._args.append((argname, argtype))
 
   @property
   def name(self):
@@ -387,17 +387,11 @@ class _DefinedFunction(object):
     with temp_graph.as_default():
       # List of placeholders for the function_def.
       inputs = []
-      # Arglist to call 'func'
-      kwargs = {}
       for (argname, argtype) in self._args:
         argholder = array_ops.placeholder(argtype, name=argname)
         inputs.append(argholder)
-        kwargs[argname] = argholder
       # Call func and gather the output tensors.
-      if isinstance(self._input_types, (list, tuple)):
-        outputs = self._func(*inputs)
-      else:
-        outputs = self._func(**kwargs)
+      outputs = self._func(*inputs)
       if not isinstance(outputs, ops.Tensor) and not outputs:
         raise ValueError("Function must return at least one tensor")
       # Convenience: if func only returned one value, make it a tuple.
@@ -442,6 +436,10 @@ class _DefinedFunction(object):
 
   def __call__(self, *args, **kwargs):
     self.add_to_graph(ops.get_default_graph())
+    if self._extra_kwargs:
+      for k in self._extra_kwargs:
+        if k not in kwargs:
+          kwargs[k] = self._extra_kwargs[k]
     return _call(self._definition.signature, *args, **kwargs)
 
 
@@ -486,16 +484,12 @@ class Defun(object):
 
   """
 
-  def __init__(self, *input_type_list, **input_types):
+  def __init__(self, *input_type_list, **kwargs):
     """Create a `Defun` decorator.
 
     Args:
       *input_type_list: A list of `tf.DType`
-      **input_types: Dict mapping string with `tf.DType`
-        One key for each argument of the function to decorate.
-
-       Note that these optional keyword arguments are also accepted:
-
+      **kwargs: Optional keyword arguments, including
          func_name - (optional).  A python string, the name to use to
            declare this `Function` in the graph.
 
@@ -513,23 +507,17 @@ class Defun(object):
            This will be called by tf.gradients to add the gradient ops
            to the graph. At most one of grad_func and python_grad_func
            can be specified.
-
     """
-    self._func_name = input_types.pop("func_name", None)
-    self._grad_func = input_types.pop("grad_func", None)
-    self._python_grad_func = input_types.pop("python_grad_func", None)
-    assert not input_type_list or not input_types, (
-        "Can't specify both *input_type_list and **input_types")
-    self._input_types = input_types
     self._input_type_list = input_type_list
+    self._func_name = kwargs.pop("func_name", None)
+    self._grad_func = kwargs.pop("grad_func", None)
+    self._python_grad_func = kwargs.pop("python_grad_func", None)
+    self._extra_kwargs = kwargs
 
   def __call__(self, f):
-    if self._input_types:
-      inp_types = self._input_types
-    else:
-      inp_types = self._input_type_list
-    return _DefinedFunction(f, inp_types, self._func_name, self._grad_func,
-                            self._python_grad_func)
+    return _DefinedFunction(f, self._input_type_list, self._func_name,
+                            self._grad_func, self._python_grad_func,
+                            **self._extra_kwargs)
 
 
 class Declare(object):
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index 0260fbe03e4..4de570e9b7c 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -317,25 +317,18 @@ class FunctionTest(tf.test.TestCase):
         _ = PlusMinusV1.definition
       with self.assertRaisesRegexp(ValueError, "specified input types"):
 
-        @function.Defun(c=tf.float32)
+        @function.Defun(tf.float32)
         def PlusMinusV2(a, b):
           return a + b, b - a
 
         _ = PlusMinusV2.definition
-      with self.assertRaisesRegexp(ValueError, "type for argument: b"):
+      with self.assertRaisesRegexp(ValueError, "specified input types"):
 
-        @function.Defun(a=tf.float32, c=tf.float32)
+        @function.Defun(tf.float32, tf.float32, tf.float32)
         def PlusMinusV3(a, b):
           return a + b, b - a
 
         _ = PlusMinusV3.definition
-      with self.assertRaisesRegexp(ValueError, "specified input types"):
-
-        @function.Defun(a=tf.float32, b=tf.float32, c=tf.float32)
-        def PlusMinusV4(a, b):
-          return a + b, b - a
-
-        _ = PlusMinusV4.definition
 
   def testCallErrors(self):
 
@@ -581,8 +574,7 @@ class UnrollLSTMTest(tf.test.TestCase):
       return Loop(cell, weights, inp)
 
     cell = function.Defun(
-        x=tf.float32, mprev=tf.float32, cprev=tf.float32,
-        weights=tf.float32)(cell)
+        tf.float32, tf.float32, tf.float32, tf.float32)(cell)
     if mode == "cell":
       # Just represent the LSTM as a function.
       return Loop(cell, weights, inp)
@@ -687,7 +679,8 @@ class FunctionInlineControlTest(tf.test.TestCase):
             do_constant_folding=True)))
     for noinline in [False, True]:
 
-      @function.Defun(dtype)
+      # pylint: disable=unexpected-keyword-arg
+      @function.Defun(dtype, noinline=noinline)
       def Cell(v):
         # If v is a vector [n, 1], x is a big square matrix.
         x = tf.tanh(v + tf.transpose(v, [1, 0]))
@@ -696,9 +689,8 @@ class FunctionInlineControlTest(tf.test.TestCase):
       @function.Defun(dtype)
       def Forward(x):
         for _ in range(10):
-          # pylint: disable=unexpected-keyword-arg
           # pylint: disable=cell-var-from-loop
-          x = Cell(x, noinline=noinline)
+          x = Cell(x)
         return tf.reduce_sum(x, [0, 1])
 
       g = tf.Graph()
diff --git a/tensorflow/python/framework/gen_docs_combined.py b/tensorflow/python/framework/gen_docs_combined.py
index 79bcba4ef88..3feb80b7a8d 100644
--- a/tensorflow/python/framework/gen_docs_combined.py
+++ b/tensorflow/python/framework/gen_docs_combined.py
@@ -61,6 +61,7 @@ def module_names():
       "tf.contrib.bayesflow.stochastic_tensor",
       "tf.contrib.bayesflow.variational_inference",
       "tf.contrib.copy_graph",
+      "tf.contrib.crf",
       "tf.contrib.distributions",
       "tf.contrib.ffmpeg",
       "tf.contrib.framework",
@@ -208,6 +209,7 @@ def all_libraries(module_to_name, members, documented):
       library("contrib.bayesflow.variational_inference",
               "BayesFlow Variational Inference (contrib)",
               tf.contrib.bayesflow.variational_inference),
+      library("contrib.crf", "CRF (contrib)", tf.contrib.crf),
       library("contrib.distributions", "Statistical distributions (contrib)",
               tf.contrib.distributions),
       library("contrib.ffmpeg", "FFmpeg (contrib)", ffmpeg),
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index 5ac221318eb..ac6447ea3d3 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -231,8 +231,14 @@ def import_graph_def(graph_def, input_map=None, return_elements=None,
   else:
     producer_op_dict = {op.name: op for op in producer_op_list.op}
 
+  # LINT.IfChange
   with ops.name_scope(name, 'import', input_map.values()) as scope:
     g = ops.get_default_graph()
+    # TODO(ashankar): Should this just copy over or should it do some
+    # more nuanced merging? For example, the graph may already have some
+    # marked "bad versions" and we don't want to lose those because of
+    # what's in graph_def.versions? The C++ ImportGraphDef does something
+    # more nuanced.
     g.graph_def_versions.CopyFrom(graph_def.versions)
 
     if input_map:
@@ -444,3 +450,4 @@ def import_graph_def(graph_def, input_map=None, return_elements=None,
             raise ValueError(
                 'Requested return_element %r not found in graph_def.' % name)
       return ret
+  # LINT.ThenChange(https://www.tensorflow.org/code/tensorflow/core/graph/graph_constructor.cc)
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 8e4a03eeea1..9d8d0d1fc68 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -535,7 +535,8 @@ class TensorFlowTestCase(googletest.TestCase):
       self.fail(exception_type.__name__ + " not raised")
     except Exception as e:  # pylint: disable=broad-except
       if not isinstance(e, exception_type) or not predicate(e):
-        raise AssertionError(e)
+        raise AssertionError("Exception of type %s: %s" %
+                             (str(type(e)), str(e)))
   # pylint: enable=g-doc-return-or-yield
 
   def assertRaisesOpError(self, expected_err_re_or_predicate):
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index 5a4fbe178c0..325acdf8d4b 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -744,5 +744,29 @@ class ShapeSizeRankTest(test_util.TensorFlowTestCase):
       self.assertEqual(2, tf.rank(sp).eval())
 
 
+class SequenceMaskTest(test_util.TensorFlowTestCase):
+
+  def testExceptions(self):
+    with self.test_session():
+      with self.assertRaisesRegexp(ValueError, "lengths must be 1D"):
+        tf.sequence_mask([[10, 20]], [10, 20])
+      with self.assertRaisesRegexp(ValueError, "maxlen must be scalar"):
+        tf.sequence_mask([10, 20], [10, 20])
+
+  def testNormal(self):
+    with self.test_session():
+      res = tf.sequence_mask(tf.constant([1, 3, 2]), 5)
+      self.assertAllEqual(res.get_shape(), [3, 5])
+      self.assertAllEqual(res.eval(), [[True, False, False, False, False],
+                                       [True, True, True, False, False],
+                                       [True, True, False, False, False]])
+
+      # test dtype and default maxlen:
+      res = tf.sequence_mask(tf.constant([0, 1, 4]), dtype=tf.float32)
+      self.assertAllEqual(res.get_shape().as_list(), [3, None])
+      self.assertAllEqual(res.eval(), [[0.0, 0.0, 0.0, 0.0],
+                                       [1.0, 0.0, 0.0, 0.0],
+                                       [1.0, 1.0, 1.0, 1.0]])
+
 if __name__ == "__main__":
   tf.test.main()
diff --git a/tensorflow/python/kernel_tests/batchtospace_op_test.py b/tensorflow/python/kernel_tests/batchtospace_op_test.py
index c6e7f3cbd13..1b9b2da83f3 100644
--- a/tensorflow/python/kernel_tests/batchtospace_op_test.py
+++ b/tensorflow/python/kernel_tests/batchtospace_op_test.py
@@ -26,24 +26,44 @@ from __future__ import print_function
 import numpy as np
 import tensorflow as tf
 
+from tensorflow.python.ops import gen_array_ops
 
-class BatchToSpaceDepthToSpace(tf.test.TestCase):
+
+class PythonOpImpl(object):
+
+  @staticmethod
+  def batch_to_space(*args, **kwargs):
+    return tf.batch_to_space(*args, **kwargs)
+
+
+class CppOpImpl(object):
+
+  @staticmethod
+  def batch_to_space(*args, **kwargs):
+    return gen_array_ops._batch_to_space(*args, **kwargs)
+
+
+class BatchToSpaceDepthToSpace(tf.test.TestCase, PythonOpImpl):
 
   # Verifies that: batch_to_space(x) = transpose(depth_to_space(transpose(x)))
   def testDepthToSpaceTranspose(self):
     x = np.arange(20 * 5 * 8 * 7, dtype=np.float32).reshape([20, 5, 8, 7])
     block_size = 2
     crops = np.zeros((2, 2), dtype=np.int32)
-    y1 = tf.batch_to_space(x, crops, block_size=block_size)
+    y1 = self.batch_to_space(x, crops, block_size=block_size)
     y2 = tf.transpose(
         tf.depth_to_space(
-            tf.transpose(x, [3, 1, 2, 0]), block_size=block_size),
-        [3, 1, 2, 0])
+            tf.transpose(x, [3, 1, 2, 0]),
+            block_size=block_size), [3, 1, 2, 0])
     with self.test_session():
       self.assertAllEqual(y1.eval(), y2.eval())
 
 
-class BatchToSpaceErrorHandlingTest(tf.test.TestCase):
+class BatchToSpaceDepthToSpaceCpp(BatchToSpaceDepthToSpace, CppOpImpl):
+  pass
+
+
+class BatchToSpaceErrorHandlingTest(tf.test.TestCase, PythonOpImpl):
 
   def testInputWrongDimMissingBatch(self):
     # The input is missing the first dimension ("batch")
@@ -51,7 +71,7 @@ class BatchToSpaceErrorHandlingTest(tf.test.TestCase):
     crops = np.zeros((2, 2), dtype=np.int32)
     block_size = 2
     with self.assertRaises(ValueError):
-      _ = tf.batch_to_space(x_np, crops, block_size)
+      _ = self.batch_to_space(x_np, crops, block_size)
 
   def testBlockSize0(self):
     # The block size is 0.
@@ -59,7 +79,7 @@ class BatchToSpaceErrorHandlingTest(tf.test.TestCase):
     crops = np.zeros((2, 2), dtype=np.int32)
     block_size = 0
     with self.assertRaises(ValueError):
-      out_tf = tf.batch_to_space(x_np, crops, block_size)
+      out_tf = self.batch_to_space(x_np, crops, block_size)
       out_tf.eval()
 
   def testBlockSizeOne(self):
@@ -68,7 +88,7 @@ class BatchToSpaceErrorHandlingTest(tf.test.TestCase):
     crops = np.zeros((2, 2), dtype=np.int32)
     block_size = 1
     with self.assertRaises(ValueError):
-      out_tf = tf.batch_to_space(x_np, crops, block_size)
+      out_tf = self.batch_to_space(x_np, crops, block_size)
       out_tf.eval()
 
   def testBlockSizeLarger(self):
@@ -77,7 +97,7 @@ class BatchToSpaceErrorHandlingTest(tf.test.TestCase):
     crops = np.zeros((2, 2), dtype=np.int32)
     block_size = 10
     with self.assertRaises(ValueError):
-      out_tf = tf.batch_to_space(x_np, crops, block_size)
+      out_tf = self.batch_to_space(x_np, crops, block_size)
       out_tf.eval()
 
   def testBlockSizeSquaredNotDivisibleBatch(self):
@@ -86,22 +106,122 @@ class BatchToSpaceErrorHandlingTest(tf.test.TestCase):
     crops = np.zeros((2, 2), dtype=np.int32)
     block_size = 3
     with self.assertRaises(ValueError):
-      _ = tf.batch_to_space(x_np, crops, block_size)
+      _ = self.batch_to_space(x_np, crops, block_size)
 
   def testUnknownShape(self):
-    t = tf.batch_to_space(tf.placeholder(tf.float32), tf.placeholder(tf.int32),
-                          block_size=4)
+    t = self.batch_to_space(
+        tf.placeholder(tf.float32),
+        tf.placeholder(tf.int32),
+        block_size=4)
     self.assertEqual(4, t.get_shape().ndims)
 
 
-class BatchToSpaceGradientTest(tf.test.TestCase):
+class BatchToSpaceErrorHandlingCppTest(BatchToSpaceErrorHandlingTest,
+                                       CppOpImpl):
+  pass
+
+
+class BatchToSpaceNDErrorHandlingTest(tf.test.TestCase):
+
+  def _testStaticShape(self, input_shape, block_shape, paddings, error):
+    block_shape = np.array(block_shape)
+    paddings = np.array(paddings)
+
+    # Try with sizes known at graph construction time.
+    with self.assertRaises(error):
+      _ = tf.batch_to_space_nd(
+          np.zeros(input_shape, np.float32), block_shape, paddings)
+
+  def _testDynamicShape(self, input_shape, block_shape, paddings):
+    block_shape = np.array(block_shape)
+    paddings = np.array(paddings)
+
+    # Try with sizes unknown at graph construction time.
+    input_placeholder = tf.placeholder(tf.float32)
+    block_shape_placeholder = tf.placeholder(tf.int32, shape=block_shape.shape)
+    paddings_placeholder = tf.placeholder(tf.int32)
+    t = tf.batch_to_space_nd(input_placeholder, block_shape_placeholder,
+                             paddings_placeholder)
+
+    with self.assertRaises(ValueError):
+      _ = t.eval({input_placeholder: np.zeros(input_shape, np.float32),
+                  block_shape_placeholder: block_shape,
+                  paddings_placeholder: paddings})
+
+  def _testShape(self, input_shape, block_shape, paddings, error):
+    self._testStaticShape(input_shape, block_shape, paddings, error)
+    self._testDynamicShape(input_shape, block_shape, paddings)
+
+  def testInputWrongDimMissingBatch(self):
+    self._testShape([2, 2], [2, 2], [[0, 0], [0, 0]], ValueError)
+    self._testShape([2, 2, 3], [2, 2, 3], [[0, 0], [0, 0]], ValueError)
+
+  def testBlockSize0(self):
+    # The block size is 0.
+    self._testShape([1, 2, 2, 1], [0, 1], [[0, 0], [0, 0]], ValueError)
+
+  def testBlockSizeNegative(self):
+    self._testShape([1, 2, 2, 1], [-1, 1], [[0, 0], [0, 0]], ValueError)
+
+  def testNegativePadding(self):
+    self._testShape([1, 2, 2], [1, 1], [[0, -1], [0, 0]], ValueError)
+
+  def testCropTooLarge(self):
+    # The amount to crop exceeds the padded size.
+    self._testShape([1 * 2 * 2, 2, 3, 1], [2, 2], [[3, 2], [0, 0]], ValueError)
+
+  def testBlockSizeSquaredNotDivisibleBatch(self):
+    # The batch dimension is not divisible by the product of the block_shape.
+    self._testShape([3, 1, 1, 1], [2, 3], [[0, 0], [0, 0]], ValueError)
+
+  def testUnknownShape(self):
+    # Verify that input shape and paddings shape can be unknown.
+    _ = tf.batch_to_space_nd(
+        tf.placeholder(tf.float32),
+        tf.placeholder(tf.int32, shape=(2,)),
+        tf.placeholder(tf.int32))
+
+    # Only number of input dimensions is known.
+    t = tf.batch_to_space_nd(
+        tf.placeholder(tf.float32, shape=(None, None, None, None)),
+        tf.placeholder(tf.int32, shape=(2,)),
+        tf.placeholder(tf.int32))
+    self.assertEqual(4, t.get_shape().ndims)
+
+    # Dimensions are partially known.
+    t = tf.batch_to_space_nd(
+        tf.placeholder(tf.float32, shape=(None, None, None, 2)),
+        tf.placeholder(tf.int32, shape=(2,)),
+        tf.placeholder(tf.int32))
+    self.assertEqual([None, None, None, 2], t.get_shape().as_list())
+
+    # Dimensions are partially known.
+    t = tf.batch_to_space_nd(
+        tf.placeholder(tf.float32, shape=(3 * 2 * 3, None, None, 2)), [2, 3],
+        tf.placeholder(tf.int32))
+    self.assertEqual([3, None, None, 2], t.get_shape().as_list())
+
+    # Dimensions are partially known.
+    t = tf.batch_to_space_nd(
+        tf.placeholder(tf.float32, shape=(3 * 2 * 3, None, 2, 2)), [2, 3],
+        [[1, 1], [0, 1]])
+    self.assertEqual([3, None, 5, 2], t.get_shape().as_list())
+
+    # Dimensions are fully known.
+    t = tf.batch_to_space_nd(
+        tf.placeholder(tf.float32, shape=(3 * 2 * 3, 2, 1, 2)), [2, 3],
+        [[1, 1], [0, 0]])
+    self.assertEqual([3, 2, 3, 2], t.get_shape().as_list())
+
+
+class BatchToSpaceGradientTest(tf.test.TestCase, PythonOpImpl):
 
   # Check the gradients.
   def _checkGrad(self, x, crops, block_size):
     assert 4 == x.ndim
     with self.test_session():
       tf_x = tf.convert_to_tensor(x)
-      tf_y = tf.batch_to_space(tf_x, crops, block_size)
+      tf_y = self.batch_to_space(tf_x, crops, block_size)
       epsilon = 1e-5
       ((x_jacob_t, x_jacob_n)) = tf.test.compute_gradient(
           tf_x,
@@ -117,9 +237,9 @@ class BatchToSpaceGradientTest(tf.test.TestCase):
   # tensor of shape [b * block_size * block_size, h, w, d].
   def _compare(self, b, h, w, d, block_size, crop_beg, crop_end):
     block_size_sq = block_size * block_size
-    x = np.random.normal(
-        0, 1, b * h * w * d * block_size_sq).astype(np.float32).reshape(
-            [b * block_size * block_size, h, w, d])
+    x = np.random.normal(0, 1, b * h * w * d *
+                         block_size_sq).astype(np.float32).reshape(
+                             [b * block_size * block_size, h, w, d])
     crops = np.array([[crop_beg, crop_end], [crop_beg, crop_end]],
                      dtype=np.int32)
 
@@ -146,5 +266,48 @@ class BatchToSpaceGradientTest(tf.test.TestCase):
     self._compare(1, 2, 3, 5, block_size, crop_beg, crop_end)
 
 
+class BatchToSpaceGradientCppTest(BatchToSpaceGradientTest, CppOpImpl):
+  pass
+
+
+class BatchToSpaceNDGradientTest(tf.test.TestCase):
+
+  # Check the gradients.
+  def _checkGrad(self, x, block_shape, crops):
+    block_shape = np.array(block_shape)
+    crops = np.array(crops).reshape((len(block_shape), 2))
+    with self.test_session():
+      tf_x = tf.convert_to_tensor(x)
+      tf_y = tf.batch_to_space_nd(tf_x, block_shape, crops)
+      epsilon = 1e-5
+      ((x_jacob_t, x_jacob_n)) = tf.test.compute_gradient(
+          tf_x,
+          x.shape,
+          tf_y,
+          tf_y.get_shape().as_list(),
+          x_init_value=x,
+          delta=epsilon)
+
+    self.assertAllClose(x_jacob_t, x_jacob_n, rtol=1e-2, atol=epsilon)
+
+  def _compare(self, input_shape, block_shape, crops):
+    input_shape = list(input_shape)
+    input_shape[0] *= np.prod(block_shape)
+    x = np.random.normal(
+        0, 1, np.prod(input_shape)).astype(np.float32).reshape(input_shape)
+    self._checkGrad(x, block_shape, crops)
+
+  # Don't use very large numbers as dimensions here as the result is tensor
+  # with cartesian product of the dimensions.
+  def testSmall(self):
+    self._compare([1, 2, 3, 5], [2, 2], [[0, 0], [0, 0]])
+
+  def testSmall2(self):
+    self._compare([2, 4, 3, 2], [2, 2], [[0, 0], [0, 0]])
+
+  def testSmallCrop1x1(self):
+    self._compare([1, 2, 3, 5], [2, 2], [[1, 1], [1, 1]])
+
+
 if __name__ == "__main__":
   tf.test.main()
diff --git a/tensorflow/python/kernel_tests/init_ops_test.py b/tensorflow/python/kernel_tests/init_ops_test.py
index 1b82d6baa75..24df91c34f7 100644
--- a/tensorflow/python/kernel_tests/init_ops_test.py
+++ b/tensorflow/python/kernel_tests/init_ops_test.py
@@ -277,6 +277,13 @@ class UniformUnitScalingInitializationTest(tf.test.TestCase):
       self.assertFalse(identicaltest(self, init1, init3))
       self.assertFalse(identicaltest(self, init2, init3))
 
+  def testZeroSize(self):
+    shape = [0, 2]
+    with self.test_session():
+      x = tf.get_variable("x", shape=shape,
+                          initializer=tf.uniform_unit_scaling_initializer())
+      self.assertAllEqual(shape, x.eval().shape)
+
   def testDuplicatedInitializer(self):
     init = tf.uniform_unit_scaling_initializer()
     self.assertFalse(duplicated_initializer(self, init, 1))
diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index f314712d7cb..62a6613a73e 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -98,6 +98,13 @@ class PyOpTest(tf.test.TestCase):
       self.assertAllClose(y.eval(), 0.0)
       self.assertAllClose(z.eval(), 1.0)
 
+    # returns a tuple, Tout and inp a tuple
+    with self.test_session():
+      x = tf.constant(0.0, tf.float64)
+      y, z = tf.py_func(tuple_func, (x,), (tf.float64, tf.float64))
+      self.assertAllClose(y.eval(), 0.0)
+      self.assertAllClose(z.eval(), 1.0)
+
   def testStrings(self):
 
     def read_fixed_length_numpy_strings():
diff --git a/tensorflow/python/kernel_tests/reader_ops_test.py b/tensorflow/python/kernel_tests/reader_ops_test.py
index dd97524357d..5a0a4d71ea3 100644
--- a/tensorflow/python/kernel_tests/reader_ops_test.py
+++ b/tensorflow/python/kernel_tests/reader_ops_test.py
@@ -450,6 +450,60 @@ class TFRecordReaderTest(tf.test.TestCase):
       self.assertEqual(self._num_files * self._num_records, num_k)
       self.assertEqual(self._num_files * self._num_records, num_v)
 
+  def testReadZlibFiles(self):
+    files = self._CreateFiles()
+    zlib_files = []
+    for i, fn in enumerate(files):
+      with open(fn, "rb") as f:
+        cdata = zlib.compress(f.read())
+
+        zfn = os.path.join(self.get_temp_dir(), "tfrecord_%s.z" % i)
+        with open(zfn, "wb") as f:
+          f.write(cdata)
+        zlib_files.append(zfn)
+
+    with self.test_session() as sess:
+      options = tf.python_io.TFRecordOptions(TFRecordCompressionType.ZLIB)
+      reader = tf.TFRecordReader(name="test_reader", options=options)
+      queue = tf.FIFOQueue(99, [tf.string], shapes=())
+      key, value = reader.read(queue)
+
+      queue.enqueue_many([zlib_files]).run()
+      queue.close().run()
+      for i in range(self._num_files):
+        for j in range(self._num_records):
+          k, v = sess.run([key, value])
+          self.assertTrue(
+              tf.compat.as_text(k).startswith("%s:" % zlib_files[i]))
+          self.assertAllEqual(self._Record(i, j), v)
+
+  def testReadGzipFiles(self):
+    files = self._CreateFiles()
+    gzip_files = []
+    for i, fn in enumerate(files):
+      with open(fn, "rb") as f:
+        cdata = f.read()
+
+        zfn = os.path.join(self.get_temp_dir(), "tfrecord_%s.gz" % i)
+        with gzip.GzipFile(zfn, "wb") as f:
+          f.write(cdata)
+        gzip_files.append(zfn)
+
+    with self.test_session() as sess:
+      options = tf.python_io.TFRecordOptions(TFRecordCompressionType.GZIP)
+      reader = tf.TFRecordReader(name="test_reader", options=options)
+      queue = tf.FIFOQueue(99, [tf.string], shapes=())
+      key, value = reader.read(queue)
+
+      queue.enqueue_many([gzip_files]).run()
+      queue.close().run()
+      for i in range(self._num_files):
+        for j in range(self._num_records):
+          k, v = sess.run([key, value])
+          self.assertTrue(
+              tf.compat.as_text(k).startswith("%s:" % gzip_files[i]))
+          self.assertAllEqual(self._Record(i, j), v)
+
 
 class TFRecordWriterZlibTest(tf.test.TestCase):
 
@@ -488,7 +542,7 @@ class TFRecordWriterZlibTest(tf.test.TestCase):
   def _ZlibCompressFile(self, infile, name="tfrecord.z"):
     # zlib compress the file and write compressed contents to file.
     with open(infile, "rb") as f:
-      cdata = zlib.compress(f.read(), 6)
+      cdata = zlib.compress(f.read())
 
     zfn = os.path.join(self.get_temp_dir(), name)
     with open(zfn, "wb") as f:
diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
index 5a2a283e2db..c3a7a91a1ef 100644
--- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
@@ -21,6 +21,8 @@ from __future__ import print_function
 import numpy as np
 import tensorflow as tf
 
+from tensorflow.python.util.all_util import reveal_undocumented
+
 
 class SegmentReductionHelper(tf.test.TestCase):
 
@@ -349,6 +351,12 @@ class SparseSegmentReductionHelper(SegmentReductionHelper):
 
 class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
 
+  def setUp(self):
+    reveal_undocumented("tensorflow.python."
+                        "sparse_segment_mean_grad", tf)
+    reveal_undocumented("tensorflow.python."
+                        "sparse_segment_sqrt_n_grad", tf)
+
   def testValues(self):
     dtypes = [tf.float32,
               tf.float64,
diff --git a/tensorflow/python/kernel_tests/spacetobatch_op_test.py b/tensorflow/python/kernel_tests/spacetobatch_op_test.py
index 9f346aa3def..4e14d7c5c54 100644
--- a/tensorflow/python/kernel_tests/spacetobatch_op_test.py
+++ b/tensorflow/python/kernel_tests/spacetobatch_op_test.py
@@ -22,19 +22,89 @@ from __future__ import print_function
 import numpy as np
 import tensorflow as tf
 
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import gen_array_ops
 
-class SpaceToBatchTest(tf.test.TestCase):
-  """Tests input-output pairs for the SpaceToBatch and BatchToSpace ops."""
+
+def space_to_batch_direct(input_array, block_shape, paddings):
+  """Direct Python implementation of space-to-batch conversion.
+
+  This is used for tests only.
+
+  Args:
+    input_array: N-D array
+    block_shape: 1-D array of shape [num_block_dims].
+    paddings: 2-D array of shape [num_block_dims, 2].
+
+  Returns:
+    Converted tensor.
+  """
+  input_array = np.array(input_array)
+  block_shape = np.array(block_shape)
+  num_block_dims = len(block_shape)
+  paddings = np.array(paddings).reshape((len(block_shape), 2))
+
+  padded = np.pad(input_array,
+                  pad_width=([[0, 0]] + list(paddings) + [[0, 0]] *
+                             (input_array.ndim - 1 - num_block_dims)),
+                  mode="constant")
+  reshaped_padded_shape = [input_array.shape[0]]
+  output_shape = [input_array.shape[0] * np.prod(block_shape)]
+  for block_dim, block_shape_value in enumerate(block_shape):
+    reduced_size = padded.shape[block_dim + 1] // block_shape_value
+    reshaped_padded_shape.append(reduced_size)
+    output_shape.append(reduced_size)
+    reshaped_padded_shape.append(block_shape_value)
+  reshaped_padded_shape.extend(input_array.shape[num_block_dims + 1:])
+  output_shape.extend(input_array.shape[num_block_dims + 1:])
+
+  reshaped_padded = padded.reshape(reshaped_padded_shape)
+  permuted_reshaped_padded = np.transpose(reshaped_padded, (
+      list(np.arange(num_block_dims) * 2 + 2) + [0] +
+      list(np.arange(num_block_dims) * 2 + 1) + list(np.arange(
+          input_array.ndim - num_block_dims - 1) + 1 + num_block_dims * 2)))
+  return permuted_reshaped_padded.reshape(output_shape)
+
+
+class PythonOpImpl(object):
+
+  @staticmethod
+  def space_to_batch(*args, **kwargs):
+    return tf.space_to_batch(*args, **kwargs)
+
+  @staticmethod
+  def batch_to_space(*args, **kwargs):
+    return tf.batch_to_space(*args, **kwargs)
+
+
+class CppOpImpl(object):
+
+  @staticmethod
+  def space_to_batch(*args, **kwargs):
+    return gen_array_ops._space_to_batch(*args, **kwargs)
+
+  @staticmethod
+  def batch_to_space(*args, **kwargs):
+    return gen_array_ops._batch_to_space(*args, **kwargs)
+
+
+class SpaceToBatchTest(tf.test.TestCase, PythonOpImpl):
+  """Tests input-output pairs for the SpaceToBatch and BatchToSpace ops.
+
+  This uses the Python compatibility wrapper that forwards to space_to_batch_nd.
+  """
 
   def _testPad(self, inputs, paddings, block_size, outputs):
     with self.test_session(use_gpu=True):
       # outputs = space_to_batch(inputs)
-      x_tf = tf.space_to_batch(
-          tf.to_float(inputs), paddings, block_size=block_size)
+      x_tf = self.space_to_batch(
+          tf.to_float(inputs),
+          paddings, block_size=block_size)
       self.assertAllEqual(x_tf.eval(), outputs)
       # inputs = batch_to_space(outputs)
-      x_tf = tf.batch_to_space(
-          tf.to_float(outputs), paddings, block_size=block_size)
+      x_tf = self.batch_to_space(
+          tf.to_float(outputs),
+          paddings, block_size=block_size)
       self.assertAllEqual(x_tf.eval(), inputs)
 
   def _testOne(self, inputs, block_size, outputs):
@@ -117,23 +187,140 @@ class SpaceToBatchTest(tf.test.TestCase):
     self._testOne(x_np, block_size, x_out)
 
 
-class SpaceToBatchSpaceToDepth(tf.test.TestCase):
+class SpaceToBatchCppTest(SpaceToBatchTest, CppOpImpl):
+  """Tests input-output pairs for the SpaceToBatch and BatchToSpace ops.
+
+  This uses the C++ ops.
+  """
+  pass
+
+
+class SpaceToBatchNDTest(tf.test.TestCase):
+  """Tests input-output pairs for the SpaceToBatchND and BatchToSpaceND ops."""
+
+  def _testPad(self, inputs, block_shape, paddings, outputs):
+    block_shape = np.array(block_shape)
+    paddings = np.array(paddings).reshape((len(block_shape), 2))
+    for use_gpu in [False, True]:
+      with self.test_session(use_gpu=use_gpu):
+        # outputs = space_to_batch(inputs)
+        x_tf = tf.space_to_batch_nd(tf.to_float(inputs), block_shape, paddings)
+        self.assertAllEqual(x_tf.eval(), outputs)
+        # inputs = batch_to_space(outputs)
+        x_tf = tf.batch_to_space_nd(tf.to_float(outputs), block_shape, paddings)
+        self.assertAllEqual(x_tf.eval(), inputs)
+
+  def _testDirect(self, input_shape, block_shape, paddings):
+    inputs = np.arange(np.prod(input_shape), dtype=np.float32)
+    inputs = inputs.reshape(input_shape)
+    self._testPad(inputs, block_shape, paddings,
+                  space_to_batch_direct(inputs, block_shape, paddings))
+
+  def testZeroBlockDimsZeroRemainingDims(self):
+    self._testPad(inputs=[1, 2],
+                  block_shape=[],
+                  paddings=[],
+                  outputs=[1, 2],)
+
+  def testZeroBlockDimsOneRemainingDim(self):
+    self._testPad(inputs=[[1, 2], [3, 4]],
+                  block_shape=[],
+                  paddings=[],
+                  outputs=[[1, 2], [3, 4]])
+
+    # Same thing, but with a no-op block dim.
+    self._testPad(inputs=[[1, 2], [3, 4]],
+                  block_shape=[1],
+                  paddings=[[0, 0]],
+                  outputs=[[1, 2], [3, 4]])
+
+  def testZeroBlockDimsTwoRemainingDims(self):
+    self._testPad(inputs=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+                  block_shape=[],
+                  paddings=[],
+                  outputs=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
+
+    # Same thing, but with a no-op block dim.
+    self._testPad(inputs=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+                  block_shape=[1],
+                  paddings=[[0, 0]],
+                  outputs=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
+
+    # Same thing, but with two no-op block dims.
+    self._testPad(inputs=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+                  block_shape=[1, 1],
+                  paddings=[[0, 0], [0, 0]],
+                  outputs=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
+
+  def testOneBlockDimZeroRemainingDims(self):
+    self._testPad(inputs=[[1, 2, 3], [4, 5, 6]],
+                  block_shape=[2],
+                  paddings=[1, 0],
+                  outputs=[[0, 2], [0, 5], [1, 3], [4, 6]])
+
+  def testOneBlockDimOneRemainingDim(self):
+    self._testPad(
+        inputs=[[[1, 11], [2, 21], [3, 31]], [[4, 41], [5, 51], [6, 61]]],
+        block_shape=[2],
+        paddings=[1, 0],
+        outputs=[[[0, 0], [2, 21]], [[0, 0], [5, 51]], [[1, 11], [3, 31]],
+                 [[4, 41], [6, 61]]])
+
+  def testDirect(self):
+    # Test with zero-size remaining dimension.
+    self._testDirect(input_shape=[3, 1, 2, 0],
+                     block_shape=[3],
+                     paddings=[[0, 2]])
+
+    # Test with zero-size blocked dimension.
+    self._testDirect(input_shape=[3, 0, 2, 5],
+                     block_shape=[3],
+                     paddings=[[0, 0]])
+
+    # Test with padding up from zero size.
+    self._testDirect(input_shape=[3, 0, 2, 5],
+                     block_shape=[3],
+                     paddings=[[1, 2]])
+
+    self._testDirect(input_shape=[3, 3, 4, 5, 2],
+                     block_shape=[3, 4, 2],
+                     paddings=[[1, 2], [0, 0], [3, 0]])
+
+    self._testDirect(input_shape=[3, 3, 4, 5, 2],
+                     block_shape=[3, 4, 2, 2],
+                     paddings=[[1, 2], [0, 0], [3, 0], [0, 0]])
+
+    self._testDirect(input_shape=[3, 2, 2, 3, 4, 5, 2, 5],
+                     block_shape=[1, 1, 3, 4, 2, 2],
+                     paddings=[[0, 0], [0, 0], [1, 2], [0, 0], [3, 0], [0, 0]])
+
+    self._testDirect(
+        input_shape=[3, 2, 2, 3, 4, 5, 2, 5],
+        block_shape=[1, 1, 3, 4, 2, 2, 1],
+        paddings=[[0, 0], [0, 0], [1, 2], [0, 0], [3, 0], [0, 0], [0, 0]])
+
+
+class SpaceToBatchSpaceToDepth(tf.test.TestCase, PythonOpImpl):
 
   # Verifies that: space_to_batch(x) = transpose(space_to_depth(transpose(x)))
   def testSpaceToDepthTranspose(self):
     x = np.arange(5 * 10 * 16 * 7, dtype=np.float32).reshape([5, 10, 16, 7])
     block_size = 2
     paddings = np.zeros((2, 2), dtype=np.int32)
-    y1 = tf.space_to_batch(x, paddings, block_size=block_size)
+    y1 = self.space_to_batch(x, paddings, block_size=block_size)
     y2 = tf.transpose(
         tf.space_to_depth(
-            tf.transpose(x, [3, 1, 2, 0]), block_size=block_size),
-        [3, 1, 2, 0])
+            tf.transpose(x, [3, 1, 2, 0]),
+            block_size=block_size), [3, 1, 2, 0])
     with self.test_session(use_gpu=True):
       self.assertAllEqual(y1.eval(), y2.eval())
 
 
-class SpaceToBatchErrorHandlingTest(tf.test.TestCase):
+class SpaceToBatchSpaceToDepthCpp(SpaceToBatchSpaceToDepth, CppOpImpl):
+  pass
+
+
+class SpaceToBatchErrorHandlingTest(tf.test.TestCase, PythonOpImpl):
 
   def testInputWrongDimMissingBatch(self):
     # The input is missing the first dimension ("batch")
@@ -141,7 +328,7 @@ class SpaceToBatchErrorHandlingTest(tf.test.TestCase):
     paddings = np.zeros((2, 2), dtype=np.int32)
     block_size = 2
     with self.assertRaises(ValueError):
-      _ = tf.space_to_batch(x_np, paddings, block_size)
+      _ = self.space_to_batch(x_np, paddings, block_size)
 
   def testBlockSize0(self):
     # The block size is 0.
@@ -149,7 +336,7 @@ class SpaceToBatchErrorHandlingTest(tf.test.TestCase):
     paddings = np.zeros((2, 2), dtype=np.int32)
     block_size = 0
     with self.assertRaises(ValueError):
-      out_tf = tf.space_to_batch(x_np, paddings, block_size)
+      out_tf = self.space_to_batch(x_np, paddings, block_size)
       out_tf.eval()
 
   def testBlockSizeOne(self):
@@ -158,7 +345,7 @@ class SpaceToBatchErrorHandlingTest(tf.test.TestCase):
     paddings = np.zeros((2, 2), dtype=np.int32)
     block_size = 1
     with self.assertRaises(ValueError):
-      out_tf = tf.space_to_batch(x_np, paddings, block_size)
+      out_tf = self.space_to_batch(x_np, paddings, block_size)
       out_tf.eval()
 
   def testBlockSizeLarger(self):
@@ -167,7 +354,7 @@ class SpaceToBatchErrorHandlingTest(tf.test.TestCase):
     paddings = np.zeros((2, 2), dtype=np.int32)
     block_size = 10
     with self.assertRaises(ValueError):
-      out_tf = tf.space_to_batch(x_np, paddings, block_size)
+      out_tf = self.space_to_batch(x_np, paddings, block_size)
       out_tf.eval()
 
   def testBlockSizeNotDivisibleWidth(self):
@@ -176,7 +363,7 @@ class SpaceToBatchErrorHandlingTest(tf.test.TestCase):
     paddings = np.zeros((2, 2), dtype=np.int32)
     block_size = 3
     with self.assertRaises(ValueError):
-      _ = tf.space_to_batch(x_np, paddings, block_size)
+      _ = self.space_to_batch(x_np, paddings, block_size)
 
   def testBlockSizeNotDivisibleHeight(self):
     # The block size divides height but not width.
@@ -184,7 +371,7 @@ class SpaceToBatchErrorHandlingTest(tf.test.TestCase):
     paddings = np.zeros((2, 2), dtype=np.int32)
     block_size = 3
     with self.assertRaises(ValueError):
-      _ = tf.space_to_batch(x_np, paddings, block_size)
+      _ = self.space_to_batch(x_np, paddings, block_size)
 
   def testBlockSizeNotDivisibleBoth(self):
     # The block size does not divide neither width or height.
@@ -192,22 +379,118 @@ class SpaceToBatchErrorHandlingTest(tf.test.TestCase):
     paddings = np.zeros((2, 2), dtype=np.int32)
     block_size = 3
     with self.assertRaises(ValueError):
-      _ = tf.space_to_batch(x_np, paddings, block_size)
+      _ = self.space_to_batch(x_np, paddings, block_size)
 
   def testUnknownShape(self):
-    t = tf.space_to_batch(tf.placeholder(tf.float32), tf.placeholder(tf.int32),
-                          block_size=4)
+    t = self.space_to_batch(
+        tf.placeholder(tf.float32),
+        tf.placeholder(tf.int32),
+        block_size=4)
     self.assertEqual(4, t.get_shape().ndims)
 
 
-class SpaceToBatchGradientTest(tf.test.TestCase):
+class SpaceToBatchErrorHandlingCppTest(SpaceToBatchErrorHandlingTest,
+                                       CppOpImpl):
+  pass
+
+
+class SpaceToBatchNDErrorHandlingTest(tf.test.TestCase):
+
+  def _testStaticShape(self, input_shape, block_shape, paddings, error):
+    block_shape = np.array(block_shape)
+    paddings = np.array(paddings)
+
+    # Try with sizes known at graph construction time.
+    with self.assertRaises(error):
+      _ = tf.space_to_batch_nd(
+          np.zeros(input_shape, np.float32), block_shape, paddings)
+
+  def _testDynamicShape(self, input_shape, block_shape, paddings):
+    block_shape = np.array(block_shape)
+    paddings = np.array(paddings)
+    # Try with sizes unknown at graph construction time.
+    input_placeholder = tf.placeholder(tf.float32)
+    block_shape_placeholder = tf.placeholder(tf.int32, shape=block_shape.shape)
+    paddings_placeholder = tf.placeholder(tf.int32)
+    t = tf.space_to_batch_nd(input_placeholder, block_shape_placeholder,
+                             paddings_placeholder)
+
+    with self.assertRaises(ValueError):
+      _ = t.eval({input_placeholder: np.zeros(input_shape, np.float32),
+                  block_shape_placeholder: block_shape,
+                  paddings_placeholder: paddings})
+
+  def _testShape(self, input_shape, block_shape, paddings, error):
+    self._testStaticShape(input_shape, block_shape, paddings, error)
+    self._testDynamicShape(input_shape, block_shape, paddings)
+
+  def testBlockSize0(self):
+    # The block size is 0.
+    self._testShape([1, 2, 2], [0, 2], [[0, 0], [0, 0]], ValueError)
+
+  def testBlockSizeNegative(self):
+    self._testShape([1, 2, 2], [-1, 2], [[0, 0], [0, 0]], ValueError)
+
+  def testNegativePadding(self):
+    # The padding is negative.
+    self._testShape([1, 2, 2], [1, 1], [[0, -1], [0, 0]], ValueError)
+
+  def testBlockSizeNotDivisible(self):
+    # The padded size is not divisible by the block size.
+    self._testShape([1, 2, 3, 1], [3, 3], [[0, 0], [0, 0]], ValueError)
+
+  def testBlockDimsMismatch(self):
+    # Shape of block_shape does not match shape of paddings.
+    self._testStaticShape([1, 3, 3, 1], [3, 3], [[0, 0]], ValueError)
+
+  def testUnknown(self):
+    # Verify that input shape and paddings shape can be unknown.
+    _ = tf.space_to_batch_nd(
+        tf.placeholder(tf.float32),
+        tf.placeholder(tf.int32, shape=(2,)),
+        tf.placeholder(tf.int32))
+
+    # Only number of input dimensions is known.
+    t = tf.space_to_batch_nd(
+        tf.placeholder(tf.float32, shape=(None, None, None, None)),
+        tf.placeholder(tf.int32, shape=(2,)),
+        tf.placeholder(tf.int32))
+    self.assertEqual(4, t.get_shape().ndims)
+
+    # Dimensions are partially known.
+    t = tf.space_to_batch_nd(
+        tf.placeholder(tf.float32, shape=(None, None, None, 2)),
+        tf.placeholder(tf.int32, shape=(2,)),
+        tf.placeholder(tf.int32))
+    self.assertEqual([None, None, None, 2], t.get_shape().as_list())
+
+    # Dimensions are partially known.
+    t = tf.space_to_batch_nd(
+        tf.placeholder(tf.float32, shape=(3, None, None, 2)), [2, 3],
+        tf.placeholder(tf.int32))
+    self.assertEqual([3 * 2 * 3, None, None, 2], t.get_shape().as_list())
+
+    # Dimensions are partially known.
+    t = tf.space_to_batch_nd(
+        tf.placeholder(tf.float32, shape=(3, None, 2, 2)), [2, 3],
+        [[1, 1], [0, 1]])
+    self.assertEqual([3 * 2 * 3, None, 1, 2], t.get_shape().as_list())
+
+    # Dimensions are fully known.
+    t = tf.space_to_batch_nd(
+        tf.placeholder(tf.float32, shape=(3, 2, 3, 2)), [2, 3],
+        [[1, 1], [0, 0]])
+    self.assertEqual([3 * 2 * 3, 2, 1, 2], t.get_shape().as_list())
+
+
+class SpaceToBatchGradientTest(tf.test.TestCase, PythonOpImpl):
 
   # Check the gradients.
   def _checkGrad(self, x, paddings, block_size):
     assert 4 == x.ndim
     with self.test_session(use_gpu=True):
       tf_x = tf.convert_to_tensor(x)
-      tf_y = tf.space_to_batch(tf_x, paddings, block_size)
+      tf_y = self.space_to_batch(tf_x, paddings, block_size)
       epsilon = 1e-5
       ((x_jacob_t, x_jacob_n)) = tf.test.compute_gradient(
           tf_x,
@@ -223,9 +506,9 @@ class SpaceToBatchGradientTest(tf.test.TestCase):
   # tensor of shape [b, h * block_size, w * block_size, d].
   def _compare(self, b, h, w, d, block_size, pad_beg, pad_end):
     block_size_sq = block_size * block_size
-    x = np.random.normal(
-        0, 1, b * h * w * d * block_size_sq).astype(np.float32).reshape(
-            [b, h * block_size, w * block_size, d])
+    x = np.random.normal(0, 1, b * h * w * d *
+                         block_size_sq).astype(np.float32).reshape(
+                             [b, h * block_size, w * block_size, d])
     paddings = np.array([[pad_beg, pad_end], [pad_beg, pad_end]],
                         dtype=np.int32)
 
@@ -252,5 +535,122 @@ class SpaceToBatchGradientTest(tf.test.TestCase):
     self._compare(1, 2, 3, 5, block_size, pad_beg, pad_end)
 
 
+class SpaceToBatchGradientCppTest(SpaceToBatchGradientTest, CppOpImpl):
+  pass
+
+
+class SpaceToBatchNDGradientTest(tf.test.TestCase):
+
+  # Check the gradients.
+  def _checkGrad(self, x, block_shape, paddings):
+    block_shape = np.array(block_shape)
+    paddings = np.array(paddings).reshape((len(block_shape), 2))
+    with self.test_session():
+      tf_x = tf.convert_to_tensor(x)
+      tf_y = tf.space_to_batch_nd(tf_x, block_shape, paddings)
+      epsilon = 1e-5
+      ((x_jacob_t, x_jacob_n)) = tf.test.compute_gradient(
+          tf_x,
+          x.shape,
+          tf_y,
+          tf_y.get_shape().as_list(),
+          x_init_value=x,
+          delta=epsilon)
+
+    self.assertAllClose(x_jacob_t, x_jacob_n, rtol=1e-2, atol=epsilon)
+
+  def _compare(self, input_shape, block_shape, paddings):
+    x = np.random.normal(
+        0, 1, np.prod(input_shape)).astype(np.float32).reshape(input_shape)
+    self._checkGrad(x, block_shape, paddings)
+
+  # Don't use very large numbers as dimensions here as the result is tensor
+  # with cartesian product of the dimensions.
+  def testSmall(self):
+    self._compare([1, 4, 6, 5], [2, 2], [[0, 0], [0, 0]])
+
+  def testSmall2(self):
+    self._compare([2, 8, 6, 2], [2, 2], [[0, 0], [0, 0]])
+
+  def testSmallPad1(self):
+    self._compare([2, 4, 6, 2], [2, 2], [[1, 1], [1, 1]])
+
+  def testSmallPadThreeBlockDims(self):
+    self._compare([2, 2, 4, 3, 2], [2, 2, 2], [[1, 1], [1, 1], [1, 0]])
+
+
+class RequiredSpaceToBatchPaddingsTest(tf.test.TestCase):
+
+  def _checkProperties(self, input_shape, block_shape, base_paddings, paddings,
+                       crops):
+    """Checks that `paddings` and `crops` satisfy invariants."""
+    num_block_dims = len(block_shape)
+    self.assertEqual(len(input_shape), num_block_dims)
+    if base_paddings is None:
+      base_paddings = np.zeros((num_block_dims, 2), np.int32)
+    self.assertEqual(base_paddings.shape, (num_block_dims, 2))
+    self.assertEqual(paddings.shape, (num_block_dims, 2))
+    self.assertEqual(crops.shape, (num_block_dims, 2))
+    for i in range(num_block_dims):
+      self.assertEqual(paddings[i, 0], base_paddings[i, 0])
+      self.assertLessEqual(0, paddings[i, 1] - base_paddings[i, 1])
+      self.assertLess(paddings[i, 1] - base_paddings[i, 1], block_shape[i])
+      self.assertEqual((input_shape[i] + paddings[i, 0] + paddings[i, 1]) %
+                       block_shape[i], 0)
+      self.assertEqual(crops[i, 0], 0)
+      self.assertEqual(crops[i, 1], paddings[i, 1] - base_paddings[i, 1])
+
+  def _test(self, input_shape, block_shape, base_paddings):
+    input_shape = np.array(input_shape)
+    block_shape = np.array(block_shape)
+    if base_paddings is not None:
+      base_paddings = np.array(base_paddings)
+    # Check with constants.
+    paddings, crops = tf.required_space_to_batch_paddings(
+        input_shape, block_shape, base_paddings)
+    paddings_const = tensor_util.constant_value(paddings)
+    crops_const = tensor_util.constant_value(crops)
+    self.assertIsNotNone(paddings_const)
+    self.assertIsNotNone(crops_const)
+    self._checkProperties(input_shape, block_shape, base_paddings,
+                          paddings_const, crops_const)
+    # Check with non-constants.
+    assignments = {}
+    input_shape_placeholder = tf.placeholder(tf.int32)
+    assignments[input_shape_placeholder] = input_shape
+    block_shape_placeholder = tf.placeholder(tf.int32, [len(block_shape)])
+    assignments[block_shape_placeholder] = block_shape
+    if base_paddings is not None:
+      base_paddings_placeholder = tf.placeholder(tf.int32,
+                                                 [len(block_shape), 2])
+      assignments[base_paddings_placeholder] = base_paddings
+    else:
+      base_paddings_placeholder = None
+    t_paddings, t_crops = tf.required_space_to_batch_paddings(
+        input_shape_placeholder, block_shape_placeholder,
+        base_paddings_placeholder)
+    with self.test_session():
+      paddings_result = t_paddings.eval(assignments)
+      crops_result = t_crops.eval(assignments)
+    self.assertAllEqual(paddings_result, paddings_const)
+    self.assertAllEqual(crops_result, crops_const)
+
+  def testSimple(self):
+    self._test(input_shape=np.zeros((0,), np.int32),
+               block_shape=np.zeros((0,), np.int32),
+               base_paddings=None)
+    self._test(input_shape=np.zeros((0,), np.int32),
+               block_shape=np.zeros((0,), np.int32),
+               base_paddings=np.zeros((0, 2), np.int32))
+    self._test(input_shape=[1], block_shape=[2], base_paddings=None)
+    self._test(input_shape=[1], block_shape=[2], base_paddings=[[1, 0]])
+    self._test(input_shape=[3], block_shape=[1], base_paddings=[[1, 2]])
+    self._test(input_shape=[1], block_shape=[2], base_paddings=[[2, 3]])
+    self._test(input_shape=[4, 5], block_shape=[3, 2], base_paddings=None)
+    self._test(input_shape=[4, 5],
+               block_shape=[3, 2],
+               base_paddings=[[0, 0], [0, 1]])
+
+
 if __name__ == "__main__":
   tf.test.main()
diff --git a/tensorflow/python/kernel_tests/sparse_concat_op_test.py b/tensorflow/python/kernel_tests/sparse_concat_op_test.py
index 1aa3f1d2c01..de48b969d9c 100644
--- a/tensorflow/python/kernel_tests/sparse_concat_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_concat_op_test.py
@@ -130,18 +130,21 @@ class SparseConcatTest(tf.test.TestCase):
       # [2    ]
       # [3   4]
       for sp_a in (self._SparseTensorValue_3x3(), self._SparseTensor_3x3()):
-        sp_concat = tf.sparse_concat(1, [sp_a])
+        # Note that we ignore concat_dim in this case since we short-circuit the
+        # single-input case in python.
+        for concat_dim in (-2000, 1, 2000):
+          sp_concat = tf.sparse_concat(concat_dim, [sp_a])
 
-        self.assertEqual(sp_concat.indices.get_shape(), [4, 2])
-        self.assertEqual(sp_concat.values.get_shape(), [4])
-        self.assertEqual(sp_concat.shape.get_shape(), [2])
+          self.assertEqual(sp_concat.indices.get_shape(), [4, 2])
+          self.assertEqual(sp_concat.values.get_shape(), [4])
+          self.assertEqual(sp_concat.shape.get_shape(), [2])
 
-        concat_out = sess.run(sp_concat)
+          concat_out = sess.run(sp_concat)
 
-        self.assertAllEqual(
-            concat_out.indices, [[0, 2], [1, 0], [2, 0], [2, 2]])
-        self.assertAllEqual(concat_out.values, [1, 2, 3, 4])
-        self.assertAllEqual(concat_out.shape, [3, 3])
+          self.assertAllEqual(concat_out.indices,
+                              [[0, 2], [1, 0], [2, 0], [2, 2]])
+          self.assertAllEqual(concat_out.values, [1, 2, 3, 4])
+          self.assertAllEqual(concat_out.shape, [3, 3])
 
   def testConcat2(self):
     with self.test_session(use_gpu=False) as sess:
@@ -151,19 +154,20 @@ class SparseConcatTest(tf.test.TestCase):
       # [3   4 2     1 0]
       for sp_a in (self._SparseTensorValue_3x3(), self._SparseTensor_3x3()):
         for sp_b in (self._SparseTensorValue_3x5(), self._SparseTensor_3x5()):
-          sp_concat = tf.sparse_concat(1, [sp_a, sp_b])
+          for concat_dim in (-1, 1):
+            sp_concat = tf.sparse_concat(concat_dim, [sp_a, sp_b])
 
-          self.assertEqual(sp_concat.indices.get_shape(), [8, 2])
-          self.assertEqual(sp_concat.values.get_shape(), [8])
-          self.assertEqual(sp_concat.shape.get_shape(), [2])
+            self.assertEqual(sp_concat.indices.get_shape(), [8, 2])
+            self.assertEqual(sp_concat.values.get_shape(), [8])
+            self.assertEqual(sp_concat.shape.get_shape(), [2])
 
-          concat_out = sess.run(sp_concat)
+            concat_out = sess.run(sp_concat)
 
-          self.assertAllEqual(
-              concat_out.indices,
-              [[0, 2], [1, 0], [1, 4], [2, 0], [2, 2], [2, 3], [2, 6], [2, 7]])
-          self.assertAllEqual(concat_out.values, [1, 2, 1, 3, 4, 2, 1, 0])
-          self.assertAllEqual(concat_out.shape, [3, 8])
+            self.assertAllEqual(concat_out.indices, [[0, 2], [1, 0], [1, 4],
+                                                     [2, 0], [2, 2], [2, 3],
+                                                     [2, 6], [2, 7]])
+            self.assertAllEqual(concat_out.values, [1, 2, 1, 3, 4, 2, 1, 0])
+            self.assertAllEqual(concat_out.shape, [3, 8])
 
   def testConcatDim0(self):
     with self.test_session(use_gpu=False) as sess:
@@ -176,21 +180,20 @@ class SparseConcatTest(tf.test.TestCase):
       sp_a = self._SparseTensor_3x3()
       sp_d = self._SparseTensor_2x3()
 
-      sp_concat = tf.sparse_concat(0, [sp_a, sp_d])
+      for concat_dim in (-2, 0):
+        sp_concat = tf.sparse_concat(concat_dim, [sp_a, sp_d])
 
-      self.assertEqual(sp_concat.indices.get_shape(), [7, 2])
-      self.assertEqual(sp_concat.values.get_shape(), [7])
-      self.assertEqual(sp_concat.shape.get_shape(), [2])
+        self.assertEqual(sp_concat.indices.get_shape(), [7, 2])
+        self.assertEqual(sp_concat.values.get_shape(), [7])
+        self.assertEqual(sp_concat.shape.get_shape(), [2])
 
-      concat_out = sess.run(sp_concat)
+        concat_out = sess.run(sp_concat)
 
-      self.assertAllEqual(
-          concat_out.indices,
-          [[0, 2], [1, 0], [2, 0], [2, 2], [3, 1], [4, 0], [4, 2]])
-      self.assertAllEqual(
-          concat_out.values, np.array([1, 2, 3, 4, 1, 1, 2]))
-      self.assertAllEqual(
-          concat_out.shape, np.array([5, 3]))
+        self.assertAllEqual(
+            concat_out.indices,
+            [[0, 2], [1, 0], [2, 0], [2, 2], [3, 1], [4, 0], [4, 2]])
+        self.assertAllEqual(concat_out.values, np.array([1, 2, 3, 4, 1, 1, 2]))
+        self.assertAllEqual(concat_out.shape, np.array([5, 3]))
 
   def testConcat3(self):
     with self.test_session(use_gpu=False) as sess:
@@ -202,20 +205,20 @@ class SparseConcatTest(tf.test.TestCase):
       sp_b = self._SparseTensor_3x5()
       sp_c = self._SparseTensor_3x2()
 
-      sp_concat = tf.sparse_concat(1, [sp_a, sp_b, sp_c])
+      for concat_dim in (-1, 1):
+        sp_concat = tf.sparse_concat(concat_dim, [sp_a, sp_b, sp_c])
 
-      self.assertEqual(sp_concat.indices.get_shape(), [10, 2])
-      self.assertEqual(sp_concat.values.get_shape(), [10])
-      self.assertEqual(sp_concat.shape.get_shape(), [2])
+        self.assertEqual(sp_concat.indices.get_shape(), [10, 2])
+        self.assertEqual(sp_concat.values.get_shape(), [10])
+        self.assertEqual(sp_concat.shape.get_shape(), [2])
 
-      concat_out = sess.run(sp_concat)
+        concat_out = sess.run(sp_concat)
 
-      self.assertAllEqual(
-          concat_out.indices,
-          [[0, 2], [1, 0], [1, 4], [1, 8], [2, 0], [2, 2], [2, 3], [2, 6],
-           [2, 7], [2, 8]])
-      self.assertAllEqual(concat_out.values, [1, 2, 1, 1, 3, 4, 2, 1, 0, 2])
-      self.assertAllEqual(concat_out.shape, [3, 10])
+        self.assertAllEqual(concat_out.indices, [[0, 2], [1, 0], [1, 4], [1, 8],
+                                                 [2, 0], [2, 2], [2, 3], [2, 6],
+                                                 [2, 7], [2, 8]])
+        self.assertAllEqual(concat_out.values, [1, 2, 1, 1, 3, 4, 2, 1, 0, 2])
+        self.assertAllEqual(concat_out.shape, [3, 10])
 
   def testConcatNonNumeric(self):
     with self.test_session(use_gpu=False) as sess:
@@ -226,20 +229,21 @@ class SparseConcatTest(tf.test.TestCase):
       sp_a = self._SparseTensor_String3x3()
       sp_b = self._SparseTensor_String3x5()
 
-      sp_concat = tf.sparse_concat(1, [sp_a, sp_b])
+      for concat_dim in (-1, 1):
+        sp_concat = tf.sparse_concat(concat_dim, [sp_a, sp_b])
 
-      self.assertEqual(sp_concat.indices.get_shape(), [8, 2])
-      self.assertEqual(sp_concat.values.get_shape(), [8])
-      self.assertEqual(sp_concat.shape.get_shape(), [2])
+        self.assertEqual(sp_concat.indices.get_shape(), [8, 2])
+        self.assertEqual(sp_concat.values.get_shape(), [8])
+        self.assertEqual(sp_concat.shape.get_shape(), [2])
 
-      concat_out = sess.run(sp_concat)
+        concat_out = sess.run(sp_concat)
 
-      self.assertAllEqual(
-          concat_out.indices,
-          [[0, 2], [1, 0], [1, 4], [2, 0], [2, 2], [2, 3], [2, 6], [2, 7]])
-      self.assertAllEqual(
-          concat_out.values, [b"a", b"b", b"e", b"c", b"d", b"f", b"g", b"h"])
-      self.assertAllEqual(concat_out.shape, [3, 8])
+        self.assertAllEqual(
+            concat_out.indices,
+            [[0, 2], [1, 0], [1, 4], [2, 0], [2, 2], [2, 3], [2, 6], [2, 7]])
+        self.assertAllEqual(concat_out.values,
+                            [b"a", b"b", b"e", b"c", b"d", b"f", b"g", b"h"])
+        self.assertAllEqual(concat_out.shape, [3, 8])
 
   def testMismatchedRank(self):
     with self.test_session(use_gpu=False):
@@ -247,8 +251,9 @@ class SparseConcatTest(tf.test.TestCase):
       sp_e = self._SparseTensor_2x3x4()
 
       # Rank mismatches can be caught at shape-inference time
-      with self.assertRaises(ValueError):
-        tf.sparse_concat(1, [sp_a, sp_e])
+      for concat_dim in (-1, 1):
+        with self.assertRaises(ValueError):
+          tf.sparse_concat(concat_dim, [sp_a, sp_e])
 
   def testMismatchedRankExpandNonconcatDim(self):
     with self.test_session(use_gpu=False):
@@ -257,8 +262,9 @@ class SparseConcatTest(tf.test.TestCase):
 
       # Rank mismatches should be caught at shape-inference time, even for
       # expand_nonconcat_dim=True.
-      with self.assertRaises(ValueError):
-        tf.sparse_concat(1, [sp_a, sp_e], expand_nonconcat_dim=True)
+      for concat_dim in (-1, 1):
+        with self.assertRaises(ValueError):
+          tf.sparse_concat(concat_dim, [sp_a, sp_e], expand_nonconcat_dim=True)
 
   def testMismatchedShapes(self):
     with self.test_session(use_gpu=False) as sess:
@@ -266,11 +272,12 @@ class SparseConcatTest(tf.test.TestCase):
       sp_b = self._SparseTensor_3x5()
       sp_c = self._SparseTensor_3x2()
       sp_d = self._SparseTensor_2x3()
-      sp_concat = tf.sparse_concat(1, [sp_a, sp_b, sp_c, sp_d])
+      for concat_dim in (-1, 1):
+        sp_concat = tf.sparse_concat(concat_dim, [sp_a, sp_b, sp_c, sp_d])
 
-      # Shape mismatches can only be caught when the op is run
-      with self.assertRaisesOpError("Input shapes must match"):
-        sess.run(sp_concat)
+        # Shape mismatches can only be caught when the op is run
+        with self.assertRaisesOpError("Input shapes must match"):
+          sess.run(sp_concat)
 
   def testMismatchedShapesExpandNonconcatDim(self):
     with self.test_session(use_gpu=False) as sess:
@@ -278,35 +285,31 @@ class SparseConcatTest(tf.test.TestCase):
       sp_b = self._SparseTensor_3x5()
       sp_c = self._SparseTensor_3x2()
       sp_d = self._SparseTensor_2x3()
-      sp_concat_dim0 = tf.sparse_concat(0, [sp_a, sp_b, sp_c, sp_d],
-                                        expand_nonconcat_dim=True)
-      sp_concat_dim1 = tf.sparse_concat(1, [sp_a, sp_b, sp_c, sp_d],
-                                        expand_nonconcat_dim=True)
+      for concat_dim0 in (-2, 0):
+        for concat_dim1 in (-1, 1):
+          sp_concat_dim0 = tf.sparse_concat(
+              concat_dim0, [sp_a, sp_b, sp_c, sp_d], expand_nonconcat_dim=True)
+          sp_concat_dim1 = tf.sparse_concat(
+              concat_dim1, [sp_a, sp_b, sp_c, sp_d], expand_nonconcat_dim=True)
 
-      sp_concat_dim0_out = sess.run(sp_concat_dim0)
-      sp_concat_dim1_out = sess.run(sp_concat_dim1)
+          sp_concat_dim0_out = sess.run(sp_concat_dim0)
+          sp_concat_dim1_out = sess.run(sp_concat_dim1)
 
-      self.assertAllEqual(
-          sp_concat_dim0_out.indices,
-          [[0, 2], [1, 0], [2, 0], [2, 2], [4, 1], [5, 0], [5, 3], [5, 4],
-           [7, 0], [8, 0], [9, 1], [10, 0], [10, 2]])
-      self.assertAllEqual(
-          sp_concat_dim0_out.values,
-          [1, 2, 3, 4, 1, 2, 1, 0, 1, 2, 1, 1, 2])
-      self.assertAllEqual(
-          sp_concat_dim0_out.shape,
-          [11, 5])
+          self.assertAllEqual(sp_concat_dim0_out.indices,
+                              [[0, 2], [1, 0], [2, 0], [2, 2], [4, 1], [5, 0],
+                               [5, 3], [5, 4], [7, 0], [8, 0], [9, 1], [10, 0],
+                               [10, 2]])
+          self.assertAllEqual(sp_concat_dim0_out.values,
+                              [1, 2, 3, 4, 1, 2, 1, 0, 1, 2, 1, 1, 2])
+          self.assertAllEqual(sp_concat_dim0_out.shape, [11, 5])
 
-      self.assertAllEqual(
-          sp_concat_dim1_out.indices,
-          [[0, 2], [0, 11], [1, 0], [1, 4], [1, 8], [1, 10], [1, 12], [2, 0],
-           [2, 2], [2, 3], [2, 6], [2, 7], [2, 8]])
-      self.assertAllEqual(
-          sp_concat_dim1_out.values,
-          [1, 1, 2, 1, 1, 1, 2, 3, 4, 2, 1, 0, 2])
-      self.assertAllEqual(
-          sp_concat_dim1_out.shape,
-          [3, 13])
+          self.assertAllEqual(sp_concat_dim1_out.indices,
+                              [[0, 2], [0, 11], [1, 0], [1, 4], [1, 8], [1, 10],
+                               [1, 12], [2, 0], [2, 2], [2, 3], [2, 6], [2, 7],
+                               [2, 8]])
+          self.assertAllEqual(sp_concat_dim1_out.values,
+                              [1, 1, 2, 1, 1, 1, 2, 3, 4, 2, 1, 0, 2])
+          self.assertAllEqual(sp_concat_dim1_out.shape, [3, 13])
 
   def testShapeInferenceUnknownShapes(self):
     with self.test_session(use_gpu=False):
@@ -316,11 +319,12 @@ class SparseConcatTest(tf.test.TestCase):
           self._SparseTensor_UnknownShape(ind_shape=[1, 3]),
           self._SparseTensor_UnknownShape(shape_shape=[3])]
 
-      sp_concat = tf.sparse_concat(0, sp_inputs)
+      for concat_dim in (-2, 0):
+        sp_concat = tf.sparse_concat(concat_dim, sp_inputs)
 
-      self.assertEqual(sp_concat.indices.get_shape().as_list(), [None, 3])
-      self.assertEqual(sp_concat.values.get_shape().as_list(), [None])
-      self.assertEqual(sp_concat.shape.get_shape(), [3])
+        self.assertEqual(sp_concat.indices.get_shape().as_list(), [None, 3])
+        self.assertEqual(sp_concat.values.get_shape().as_list(), [None])
+        self.assertEqual(sp_concat.shape.get_shape(), [3])
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/sparse_xent_op_test.py b/tensorflow/python/kernel_tests/sparse_xent_op_test.py
index d67d5b7f9f0..2b2ae4a7f97 100644
--- a/tensorflow/python/kernel_tests/sparse_xent_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_xent_op_test.py
@@ -72,18 +72,28 @@ class SparseXentTest(tf.test.TestCase):
         [1., 2., 3., 4.],
         [1., 2., 3., 4.]]
     labels = [4, 3, 0, -1]
-    with self.test_session(use_gpu=True) as sess:
-      loss, backprop = gen_nn_ops._sparse_softmax_cross_entropy_with_logits(
-          features, labels)
-      tf_loss, tf_backprop = sess.run([loss, backprop])
-      self.assertAllClose(
-          [[np.nan] * 4,
-           [0.25, 0.25, 0.25, -0.75],
-           [-0.968, 0.087, 0.237, 0.6439],
-           [np.nan] * 4],
-          tf_backprop, rtol=1e-3, atol=1e-3)
-      self.assertAllClose(
-          [np.nan, 1.3862, 3.4420, np.nan], tf_loss, rtol=1e-3, atol=1e-3)
+
+    if tf.test.is_built_with_cuda() and tf.test.is_gpu_available():
+      with self.test_session(use_gpu=True) as sess:
+        loss, backprop = (
+            gen_nn_ops._sparse_softmax_cross_entropy_with_logits(
+                features, labels))
+        tf_loss, tf_backprop = sess.run([loss, backprop])
+        self.assertAllClose(
+            [[np.nan] * 4,
+             [0.25, 0.25, 0.25, -0.75],
+             [-0.968, 0.087, 0.237, 0.6439],
+             [np.nan] * 4],
+            tf_backprop, rtol=1e-3, atol=1e-3)
+        self.assertAllClose(
+            [np.nan, 1.3862, 3.4420, np.nan], tf_loss, rtol=1e-3, atol=1e-3)
+
+    with self.test_session(use_gpu=False) as sess:
+      loss, backprop = (
+          gen_nn_ops._sparse_softmax_cross_entropy_with_logits(
+              features, labels))
+      with self.assertRaisesOpError("Received a label value of"):
+        sess.run([loss, backprop])
 
   def testNpXent(self):
     # We create 2 batches of logits for testing.
diff --git a/tensorflow/python/lib/io/file_io.i b/tensorflow/python/lib/io/file_io.i
index e4a9aa94c52..8eb474b7543 100644
--- a/tensorflow/python/lib/io/file_io.i
+++ b/tensorflow/python/lib/io/file_io.i
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/core/lib/io/buffered_inputstream.h"
 #include "tensorflow/core/lib/io/inputstream_interface.h"
 #include "tensorflow/core/lib/io/random_inputstream.h"
-#include "tensorflow/core/lib/io/match.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/file_statistics.h"
@@ -70,9 +69,8 @@ void WriteStringToFile(const string& filename, const string& file_content,
 std::vector<string> GetMatchingFiles(const string& filename,
                                      TF_Status* out_status) {
   std::vector<string> results;
-  tensorflow::Status status =
-      tensorflow::io::GetMatchingFiles(tensorflow::Env::Default(), filename,
-          &results);
+  tensorflow::Status status = tensorflow::Env::Default()->GetMatchingPaths(
+      filename, &results);
   if (!status.ok()) {
     Set_TF_Status_from_Status(out_status, status);
   }
diff --git a/tensorflow/python/lib/io/py_record_reader.cc b/tensorflow/python/lib/io/py_record_reader.cc
index 47c0878932c..d3f557506e0 100644
--- a/tensorflow/python/lib/io/py_record_reader.cc
+++ b/tensorflow/python/lib/io/py_record_reader.cc
@@ -43,14 +43,9 @@ PyRecordReader* PyRecordReader::New(const string& filename, uint64 start_offset,
   reader->offset_ = start_offset;
   reader->file_ = file.release();
 
-  RecordReaderOptions options;
-  if (compression_type_string == "ZLIB") {
-    options.compression_type = RecordReaderOptions::ZLIB_COMPRESSION;
-    options.zlib_options = ZlibCompressionOptions::DEFAULT();
-  } else if (compression_type_string == "GZIP") {
-    options.compression_type = RecordReaderOptions::ZLIB_COMPRESSION;
-    options.zlib_options = ZlibCompressionOptions::GZIP();
-  }
+  RecordReaderOptions options =
+      RecordReaderOptions::CreateRecordReaderOptions(compression_type_string);
+
   reader->reader_ = new RecordReader(reader->file_, options);
   return reader;
 }
diff --git a/tensorflow/python/lib/io/py_record_writer.cc b/tensorflow/python/lib/io/py_record_writer.cc
index d9fdda7ebfa..039e59756ec 100644
--- a/tensorflow/python/lib/io/py_record_writer.cc
+++ b/tensorflow/python/lib/io/py_record_writer.cc
@@ -39,14 +39,9 @@ PyRecordWriter* PyRecordWriter::New(const string& filename,
   PyRecordWriter* writer = new PyRecordWriter;
   writer->file_ = file.release();
 
-  RecordWriterOptions options;
-  if (compression_type_string == "ZLIB") {
-    options.compression_type = RecordWriterOptions::ZLIB_COMPRESSION;
-    options.zlib_options = ZlibCompressionOptions::DEFAULT();
-  } else if (compression_type_string == "GZIP") {
-    options.compression_type = RecordWriterOptions::ZLIB_COMPRESSION;
-    options.zlib_options = ZlibCompressionOptions::GZIP();
-  }
+  RecordWriterOptions options =
+      RecordWriterOptions::CreateRecordWriterOptions(compression_type_string);
+
   writer->writer_ = new RecordWriter(writer->file_, options);
   return writer;
 }
diff --git a/tensorflow/python/lib/io/tf_record.py b/tensorflow/python/lib/io/tf_record.py
index 96b212c8ade..c07ff5c2d3d 100644
--- a/tensorflow/python/lib/io/tf_record.py
+++ b/tensorflow/python/lib/io/tf_record.py
@@ -33,17 +33,21 @@ class TFRecordCompressionType(object):
 # NOTE(vrv): This will eventually be converted into a proto.  to match
 # the interface used by the C++ RecordWriter.
 class TFRecordOptions(object):
+  """Options used for manipulating TFRecord files."""
+  compression_type_map = {
+      TFRecordCompressionType.ZLIB: "ZLIB",
+      TFRecordCompressionType.GZIP: "GZIP",
+      TFRecordCompressionType.NONE: ""
+  }
 
   def __init__(self, compression_type):
     self.compression_type = compression_type
 
-  def get_type_as_string(self):
-    if self.compression_type == TFRecordCompressionType.ZLIB:
-      return "ZLIB"
-    elif self.compression_type == TFRecordCompressionType.GZIP:
-      return "GZIP"
-    else:
+  @classmethod
+  def get_compression_type_string(cls, options):
+    if not options:
       return ""
+    return cls.compression_type_map[options.compression_type]
 
 
 def tf_record_iterator(path, options=None):
@@ -59,11 +63,10 @@ def tf_record_iterator(path, options=None):
   Raises:
     IOError: If `path` cannot be opened for reading.
   """
-  compression_type_string = options.get_type_as_string() if options else ""
+  compression_type = TFRecordOptions.get_compression_type_string(options)
   with errors.raise_exception_on_not_ok_status() as status:
     reader = pywrap_tensorflow.PyRecordReader_New(
-        compat.as_bytes(path), 0, compat.as_bytes(compression_type_string),
-        status)
+        compat.as_bytes(path), 0, compat.as_bytes(compression_type), status)
 
   if reader is None:
     raise IOError("Could not open %s." % path)
@@ -94,12 +97,11 @@ class TFRecordWriter(object):
     Raises:
       IOError: If `path` cannot be opened for writing.
     """
-    compression_type_string = options.get_type_as_string() if options else ""
+    compression_type = TFRecordOptions.get_compression_type_string(options)
 
     with errors.raise_exception_on_not_ok_status() as status:
       self._writer = pywrap_tensorflow.PyRecordWriter_New(
-          compat.as_bytes(path), compat.as_bytes(compression_type_string),
-          status)
+          compat.as_bytes(path), compat.as_bytes(compression_type), status)
 
   def __enter__(self):
     """Enter a `with` block."""
diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py
index 465fda95d7b..44e28c5b5d1 100644
--- a/tensorflow/python/ops/array_grad.py
+++ b/tensorflow/python/ops/array_grad.py
@@ -423,6 +423,13 @@ def _SpaceToBatchGrad(op, grad):
           None]
 
 
+@ops.RegisterGradient("SpaceToBatchND")
+def _SpaceToBatchNDGrad(op, grad):
+  # Its gradient is the opposite op: BatchToSpaceND.
+  return [array_ops.batch_to_space_nd(grad, op.inputs[1], op.inputs[2]),
+          None, None]
+
+
 @ops.RegisterGradient("BatchToSpace")
 def _BatchToSpaceGrad(op, grad):
   # Its gradient is the opposite op: SpaceToBatch.
@@ -431,6 +438,13 @@ def _BatchToSpaceGrad(op, grad):
           None]
 
 
+@ops.RegisterGradient("BatchToSpaceND")
+def _BatchToSpaceNDGrad(op, grad):
+  # Its gradient is the opposite op: SpaceToBatchND.
+  return [array_ops.space_to_batch_nd(grad, op.inputs[1], op.inputs[2]),
+          None, None]
+
+
 @ops.RegisterGradient("SpaceToDepth")
 def _SpaceToDepthGrad(op, grad):
   # Its gradient is the opposite op: DepthToSpace.
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index a6aeab1b843..9920c003fce 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -25,6 +25,7 @@ types in your graph.
 @@to_int32
 @@to_int64
 @@cast
+@@bitcast
 @@saturate_cast
 
 ## Shapes and Shaping
@@ -33,6 +34,7 @@ TensorFlow provides several operations that you can use to determine the shape
 of a tensor and change the shape of a tensor.
 
 @@shape
+@@shape_n
 @@size
 @@rank
 @@reshape
@@ -57,16 +59,21 @@ or join multiple tensors together.
 @@reverse
 @@transpose
 @@extract_image_patches
+@@space_to_batch_nd
 @@space_to_batch
+@@required_space_to_batch_paddings
+@@batch_to_space_nd
 @@batch_to_space
 @@space_to_depth
 @@depth_to_space
 @@gather
 @@gather_nd
+@@unique_with_counts
 @@dynamic_partition
 @@dynamic_stitch
 @@boolean_mask
 @@one_hot
+@@sequence_mask
 
 """
 from __future__ import absolute_import
@@ -1909,16 +1916,133 @@ def _QuantizeDequantizeShape(op):
 ops.RegisterShape("ExtractImagePatches")(common_shapes.call_cpp_shape_fn)
 
 
+def required_space_to_batch_paddings(input_shape,
+                                     block_shape,
+                                     base_paddings=None,
+                                     name=None):
+  """Calculate padding required to make block_shape divide input_shape.
+
+  This function can be used to calculate a suitable paddings argument for use
+  with space_to_batch_nd and batch_to_space_nd.
+
+  Args:
+    input_shape: int32 Tensor of shape [N].
+    block_shape: int32 Tensor of shape [N].
+    base_paddings: Optional int32 Tensor of shape [N, 2].  Specifies the minimum
+      amount of padding to use.  All elements must be >= 0.  If not specified,
+      defaults to 0.
+    name: string.  Optional name prefix.
+
+  Returns:
+    (paddings, crops), where:
+
+    `paddings` and `crops` are int32 Tensors of rank 2 and shape [N, 2]
+    satisfying:
+
+        paddings[i, 0] = base_paddings[i, 0].
+        0 <= paddings[i, 1] - base_paddings[i, 1] < block_shape[i]
+        (input_shape[i] + paddings[i, 0] + paddings[i, 1]) % block_shape[i] == 0
+
+        crops[i, 0] = 0
+        crops[i, 1] = paddings[i, 1] - base_paddings[i, 1]
+
+  Raises: ValueError if called with incompatible shapes.
+  """
+  with ops.name_scope(name, "required_space_to_batch_paddings",
+                      [input_shape, block_shape]):
+    input_shape = ops.convert_to_tensor(input_shape,
+                                        dtype=dtypes.int32,
+                                        name="input_shape")
+    block_shape = ops.convert_to_tensor(block_shape,
+                                        dtype=dtypes.int32,
+                                        name="block_shape")
+
+    block_shape.get_shape().assert_is_fully_defined()
+    block_shape.get_shape().assert_has_rank(1)
+    num_block_dims = block_shape.get_shape()[0].value
+    if num_block_dims == 0:
+      return zeros([0, 2], dtypes.int32), zeros([0, 2], dtypes.int32)
+
+    input_shape.get_shape().assert_is_compatible_with([num_block_dims])
+
+    if base_paddings is not None:
+      base_paddings = ops.convert_to_tensor(base_paddings,
+                                            dtype=dtypes.int32,
+                                            name="base_paddings")
+      base_paddings.get_shape().assert_is_compatible_with([num_block_dims, 2])
+    else:
+      base_paddings = zeros([num_block_dims, 2], dtypes.int32)
+
+    const_block_shape = tensor_util.constant_value(block_shape)
+    const_input_shape = tensor_util.constant_value(input_shape)
+    const_base_paddings = tensor_util.constant_value(base_paddings)
+    if (const_block_shape is not None and const_input_shape is not None and
+        const_base_paddings is not None):
+      block_shape = const_block_shape
+      input_shape = const_input_shape
+      base_paddings = const_base_paddings
+
+    # Use same expression for both constant and non-constant case.
+    pad_start = base_paddings[:, 0]
+    orig_pad_end = base_paddings[:, 1]
+    full_input_shape = input_shape + pad_start + orig_pad_end
+    pad_end_extra = (block_shape - full_input_shape % block_shape) % block_shape
+    pad_end = orig_pad_end + pad_end_extra
+
+    result_paddings = pack(
+        [[pad_start[i], pad_end[i]] for i in range(num_block_dims)],
+        name="paddings")
+    result_crops = pack(
+        [[0, pad_end_extra[i]] for i in range(num_block_dims)], name="crops")
+    return result_paddings, result_crops
+
+
+def space_to_batch(input, paddings, block_size, name=None):  # pylint: disable=redefined-builtin
+  result = space_to_batch_nd(input,
+                             paddings=paddings,
+                             block_shape=np.array([block_size, block_size],
+                                                  dtype=np.int64),
+                             name=name)
+  result.set_shape(result.get_shape().with_rank(4))
+  return result
+
+
+space_to_batch.__doc__ = gen_array_ops._space_to_batch.__doc__
+
+
+def batch_to_space(input, crops, block_size, name=None):  # pylint: disable=redefined-builtin
+  result = batch_to_space_nd(input,
+                             crops=crops,
+                             block_shape=np.array([block_size, block_size],
+                                                  dtype=np.int64),
+                             name=name)
+  result.set_shape(result.get_shape().with_rank(4))
+  return result
+
+
+batch_to_space.__doc__ = gen_array_ops._batch_to_space.__doc__
+
+
 @ops.RegisterShape("SpaceToBatch")
 def _SpaceToBatchShape(op):
   return common_shapes.call_cpp_shape_fn(op, input_tensors_needed=[1])
 
 
+@ops.RegisterShape("SpaceToBatchND")
+def _SpaceToBatchNDShape(op):
+  return common_shapes.call_cpp_shape_fn(op, input_tensors_needed=[1, 2])
+
+
 @ops.RegisterShape("BatchToSpace")
 def _BatchToSpaceShape(op):
   return common_shapes.call_cpp_shape_fn(op, input_tensors_needed=[1])
 
 
+@ops.RegisterShape("BatchToSpaceND")
+def _BatchToSpaceNDShape(op):
+  return common_shapes.call_cpp_shape_fn(op, input_tensors_needed=[1, 2])
+
+
 ops.RegisterShape("SpaceToDepth")(common_shapes.call_cpp_shape_fn)
 ops.RegisterShape("DepthToSpace")(common_shapes.call_cpp_shape_fn)
 
@@ -2114,3 +2238,50 @@ def _PlaceholderWithDefaultShape(op):
   # may be *less* precise than `input_shape`.
   input_shape.assert_is_compatible_with(output_shape)
   return [output_shape]
+
+
+def sequence_mask(lengths, maxlen=None, dtype=dtypes.bool, name=None):
+  """Return a mask tensor representing the first N positions of each row.
+
+  Example:
+  ```python
+  tf.sequence_mask([1, 3, 2], 5) =
+    [[True, False, False, False, False],
+     [True, True, True, False, False],
+     [True, True, False, False, False]]
+  ```
+
+  Args:
+    lengths: 1D integer tensor, all its values <= maxlen.
+    maxlen: scalar integer tensor, maximum length of each row. Default: use
+            maximum over lengths.
+    dtype: output type of the resulting tensor.
+    name: name of the op.
+  Returns:
+    A 2D mask tensor, as shown in the example above, cast to specified dtype.
+
+  Raises:
+    ValueError: if the arguments have invalid rank.
+  """
+  with ops.name_scope(name, "SequenceMask", [lengths, maxlen]):
+    lengths = ops.convert_to_tensor(lengths)
+    if lengths.get_shape().ndims != 1:
+      raise ValueError("lengths must be 1D for sequence_mask")
+
+    if maxlen is None:
+      maxlen = gen_math_ops._max(lengths, [0])
+    else:
+      maxlen = ops.convert_to_tensor(maxlen)
+    if maxlen.get_shape().ndims != 0:
+      raise ValueError("maxlen must be scalar for sequence_mask")
+
+    # The basic idea is to compare a range row vector of size maxlen:
+    # [0, 1, 2, 3, 4]
+    # to length as a matrix with 1 column: [[1], [3], [2]].
+    # Because of broadcasting on both arguments this comparison results
+    # in a matrix of size (len(lengths), maxlen)
+    result = gen_math_ops._range(0, maxlen, 1) < expand_dims(lengths, 1)
+    if dtype is None or result.dtype.base_dtype == dtype.base_dtype:
+      return result
+    else:
+      return gen_math_ops.cast(result, dtype)
diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py
index 70e6b46d6aa..1c444de510c 100644
--- a/tensorflow/python/ops/gradients_test.py
+++ b/tensorflow/python/ops/gradients_test.py
@@ -312,7 +312,7 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
 
   @classmethod
   def _GetFunc(cls, **kwargs):
-    return function.Defun(x=tf.float32, b=tf.float32, **kwargs)(
+    return function.Defun(tf.float32, tf.float32, **kwargs)(
         cls.XSquarePlusB)
 
   def _GetFuncGradients(self, f, x_value, b_value):
@@ -351,7 +351,7 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
   def testFunctionGradientsWithGradFunc(self):
     g = ops.Graph()
     with g.as_default():
-      grad_func = function.Defun(x=tf.float32, b=tf.float32, g=tf.float32)(
+      grad_func = function.Defun(tf.float32, tf.float32, tf.float32)(
           self.XSquarePlusBGradient)
       f = self._GetFunc(grad_func=grad_func)
       # Get gradients (should add SymbolicGradient node for function, which
@@ -373,7 +373,7 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
   def testFunctionGradientWithGradFuncAndRegistration(self):
     g = ops.Graph()
     with g.as_default():
-      grad_func = function.Defun(x=tf.float32, b=tf.float32, g=tf.float32)(
+      grad_func = function.Defun(tf.float32, tf.float32, tf.float32)(
           self.XSquarePlusBGradient)
       with self.assertRaisesRegexp(ValueError, "Gradient defined twice"):
         f = self._GetFunc(grad_func=grad_func,
diff --git a/tensorflow/python/ops/hidden_ops.txt b/tensorflow/python/ops/hidden_ops.txt
index ab0d0923a06..fb2929620e7 100644
--- a/tensorflow/python/ops/hidden_ops.txt
+++ b/tensorflow/python/ops/hidden_ops.txt
@@ -1,4 +1,5 @@
 # array_ops
+BatchToSpace
 BroadcastGradientArgs
 ConcatOffset
 Concat
@@ -11,6 +12,7 @@ Pack
 Pad
 Placeholder
 RefIdentity
+SpaceToBatch
 Split
 Slice
 TileGrad  # Exported through array_grad instead of array_ops.
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index 6cd89bf81ee..e1f0ba51f8f 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -273,6 +273,8 @@ def uniform_unit_scaling_initializer(factor=1.0,
     # is the right thing for matrix multiply and convolutions (see above).
     for dim in scale_shape[:-1]:
       input_size *= float(dim)
+    # Avoid errors when initializing zero-size tensors.
+    input_size = max(input_size, 1.0)
     max_val = math.sqrt(3 / input_size) * factor
     return random_ops.random_uniform(shape, -max_val, max_val,
                                      dtype, seed=seed)
diff --git a/tensorflow/python/ops/io_ops.py b/tensorflow/python/ops/io_ops.py
index 2c34c7ba27c..5187242ebe5 100644
--- a/tensorflow/python/ops/io_ops.py
+++ b/tensorflow/python/ops/io_ops.py
@@ -464,13 +464,11 @@ class TFRecordReader(ReaderBase):
       name: A name for the operation (optional).
       options: A TFRecordOptions object (optional).
     """
-    compression_type_string = ""
-    if (options and
-        options.compression_type == python_io.TFRecordCompressionType.ZLIB):
-      compression_type_string = "ZLIB"
+    compression_type = python_io.TFRecordOptions.get_compression_type_string(
+        options)
 
-    rr = gen_io_ops._tf_record_reader(name=name,
-                                      compression_type=compression_type_string)
+    rr = gen_io_ops._tf_record_reader(
+        name=name, compression_type=compression_type)
     super(TFRecordReader, self).__init__(rr)
 
 
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 444651a58a8..d5f51dee71e 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -24,6 +24,7 @@ operators to your graph.
 @@add
 @@sub
 @@mul
+@@scalar_mul
 @@div
 @@truediv
 @@floordiv
@@ -1525,6 +1526,9 @@ def accumulate_n(inputs, shape=None, tensor_dtype=None, name=None):
   Optionally, pass `shape` and `tensor_dtype` for shape and type checking,
   otherwise, these are inferred.
 
+  NOTE: This operation is not differentiable and cannot be used if inputs depend
+  on trainable variables. Please use tf.add_n for such cases.
+
   For example:
 
   ```python
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 6d72d27ac9a..d55eced633c 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -560,7 +560,7 @@ def _softmax(logits, compute_op, dim=-1, name=None):
 
 
 def softmax(logits, dim=-1, name=None):
-  """Computes softmax activations.
+  """Computes softmax activations.
 
   For each batch `i` and class `j` we have
 
@@ -587,7 +587,7 @@ def log_softmax(logits, dim=-1, name=None):
 
   For each batch `i` and class `j` we have
 
-      logsoftmax = logits - log(reduce_sum(exp(logits), dim))
+      logsoftmax = logits - log(reduce_sum(exp(logits), dim))
 
   Args:
     logits: A non-empty `Tensor`. Must be one of the following types: `half`,
@@ -716,12 +716,14 @@ def sparse_softmax_cross_entropy_with_logits(logits, labels, name=None):
   labels of shape `[batch_size]`. But higher dimensions are supported.
 
   Args:
+
     logits: Unscaled log probabilities of rank `r` and shape
       `[d_0, d_1, ..., d_{r-2}, num_classes]` and dtype `float32` or `float64`.
     labels: `Tensor` of shape `[d_0, d_1, ..., d_{r-2}]` and dtype `int32` or
       `int64`. Each entry in `labels` must be an index in `[0, num_classes)`.
-      Other values will result in a loss of 0, but incorrect gradient
-      computations.
+      Other values will raise an exception when this op is run on CPU, and
+      return `NaN` for the corresponding loss and gradient rows
+      on GPU.
     name: A name for the operation (optional).
 
   Returns:
diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index 5121cf58470..ec623748e27 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -72,7 +72,7 @@ def rnn(cell, inputs, initial_state=None, dtype=None,
   """Creates a recurrent neural network specified by RNNCell `cell`.
 
   The simplest form of RNN network generated is:
-  ```py
+  ```python
     state = cell.zero_state(...)
     outputs = []
     for input_ in inputs:
@@ -89,11 +89,13 @@ def rnn(cell, inputs, initial_state=None, dtype=None,
   and properly propagates the state at an example's sequence length
   to the final state output.
 
-  The dynamic calculation performed is, at time t for batch row b,
+  The dynamic calculation performed is, at time `t` for batch row `b`,
+  ```python
     (output, state)(b, t) =
       (t >= sequence_length(b))
         ? (zeros(cell.output_size), states(b, sequence_length(b) - 1))
         : cell(input(b, t), state(b, t - 1))
+  ```
 
   Args:
     cell: An instance of RNNCell.
diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py
index ce5129edba3..b09f23a7d50 100644
--- a/tensorflow/python/ops/script_ops.py
+++ b/tensorflow/python/ops/script_ops.py
@@ -17,6 +17,8 @@
 TensorFlow provides allows you to wrap python/numpy functions as
 TensorFlow operators.
 
+@@py_func
+
 """
 
 # pylint: disable=g-bad-name
@@ -132,8 +134,8 @@ def py_func(func, inp, Tout, stateful=True, name=None):
   Args:
     func: A python function.
     inp: A list of `Tensor`.
-    Tout: A list of tensorflow data types or a single tensorflow data type
-          indicating what `func` returns.
+    Tout: A list or tuple of tensorflow data types or a single tensorflow data
+          type if there is only one, indicating what `func` returns.
     stateful: A boolean indicating whether the function should be considered
               stateful or stateless. I.e. whether it, given the same input, will
               return the same output and at the same time does not change state
@@ -162,20 +164,20 @@ def py_func(func, inp, Tout, stateful=True, name=None):
   # the funcs registry.
   g._cleanup_py_funcs_used_in_graph.append(cleanup)
 
-  if isinstance(Tout, list):
-    is_list = True
+  if isinstance(Tout, (list, tuple)):
+    is_list_or_tuple = True
   else:
     Tout = [Tout]
-    is_list = False
+    is_list_or_tuple = False
   if stateful:
     result = gen_script_ops._py_func(
-            input=inp, token=token, Tout=Tout, name=name)
+        input=inp, token=token, Tout=Tout, name=name)
     # pylint: enable=protected-access
   else:
     result = gen_script_ops._py_func_stateless(
         input=inp, token=token, Tout=Tout, name=name)
     # pylint: enable=protected-access
-  return result if is_list else result[0]
+  return result if is_list_or_tuple else result[0]
 
 
 ops.RegisterShape("PyFunc")(common_shapes.call_cpp_shape_fn)
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index 3230e11464f..05ad3082d3c 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -196,7 +196,8 @@ def sparse_concat(concat_dim, sp_inputs, name=None, expand_nonconcat_dim=False):
 
 
   Args:
-    concat_dim: Dimension to concatenate along.
+    concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
+      where rank is the number of dimensions in each input `SparseTensor`.
     sp_inputs: List of `SparseTensor` to concatenate.
     name: A name prefix for the returned tensors (optional).
     expand_nonconcat_dim: Whether to allow the expansion in the non-concat
@@ -220,10 +221,11 @@ def sparse_concat(concat_dim, sp_inputs, name=None, expand_nonconcat_dim=False):
   if expand_nonconcat_dim:
     max_shape = math_ops.reduce_max(array_ops.concat(0, [array_ops.reshape(
         shape, [1, -1]) for shape in shapes]), 0)
-    shapes = [array_ops.concat(0, [max_shape[:concat_dim],
-                                   shape[concat_dim:concat_dim + 1],
-                                   max_shape[concat_dim + 1:]])
-              for shape in shapes]
+    shapes = [array_ops.concat(0, [
+        max_shape[:concat_dim],
+        shape[-1:] if concat_dim == -1 else shape[concat_dim:concat_dim + 1],
+        [] if concat_dim == -1 else max_shape[concat_dim + 1:]
+    ]) for shape in shapes]
 
   output_ind, output_val, output_shape = (
       gen_sparse_ops._sparse_concat(inds,
diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py
index ad8463a30c4..03f103ce3b2 100644
--- a/tensorflow/python/ops/state_ops.py
+++ b/tensorflow/python/ops/state_ops.py
@@ -25,6 +25,7 @@ collected in the graph.
 @@all_variables
 @@trainable_variables
 @@local_variables
+@@model_variables
 @@moving_average_variables
 
 @@initialize_all_variables
@@ -34,6 +35,10 @@ collected in the graph.
 @@report_uninitialized_variables
 @@assert_variables_initialized
 
+@@assign
+@@assign_add
+@@assign_sub
+
 ## Saving and Restoring Variables
 
 @@Saver
diff --git a/tensorflow/python/platform/flags_test.py b/tensorflow/python/platform/flags_test.py
index 39d92bd399c..bc2d012d039 100644
--- a/tensorflow/python/platform/flags_test.py
+++ b/tensorflow/python/platform/flags_test.py
@@ -19,9 +19,9 @@ from __future__ import division
 from __future__ import print_function
 
 import sys
+import unittest
 
-from tensorflow.python.platform import googletest
-
+from tensorflow.python.platform import app
 from tensorflow.python.platform import flags
 
 
@@ -39,7 +39,7 @@ flags.DEFINE_bool("bool_e", True, "HelpString")
 
 FLAGS = flags.FLAGS
 
-class FlagsTest(googletest.TestCase):
+class FlagsTest(unittest.TestCase):
 
   def testString(self):
     res = FLAGS.string_foo
@@ -65,16 +65,9 @@ class FlagsTest(googletest.TestCase):
     # --bool_flag=True sets to True
     self.assertEqual(True, FLAGS.bool_c)
 
-    # --no before the flag mirrors argparse's behavior with
-    # regard to dashes in flag names
-    self.assertEqual(False, FLAGS.bool_dash_negation)
-
     # --bool_flag=False sets to False
     self.assertEqual(False, FLAGS.bool_d)
 
-    # --bool_flag=gibberish sets to False
-    self.assertEqual(False, FLAGS.bool_e)
-
   def testInt(self):
     res = FLAGS.int_foo
     self.assertEquals(res, 42)
@@ -88,14 +81,18 @@ class FlagsTest(googletest.TestCase):
     self.assertEqual(-1.0, FLAGS.float_foo)
 
 
+def main(_):
+  # unittest.main() tries to interpret the unknown flags, so use the
+  # direct functions instead.
+  runner = unittest.TextTestRunner()
+  itersuite = unittest.TestLoader().loadTestsFromTestCase(FlagsTest)
+  runner.run(itersuite)
+
+
 if __name__ == "__main__":
   # Test command lines
-  sys.argv.extend(["--bool_a", "--nobool_negation", "--nobool-dash-negation",
-                   "--bool_c=True", "--bool_d=False", "--bool_e=gibberish",
+  sys.argv.extend(["--bool_a", "--nobool_negation",
+                   "--bool_c=True", "--bool_d=False",
                    "--unknown_flag", "and_argument"])
 
-  # googletest.main() tries to interpret the above flags, so use the
-  # direct functions instead.
-  runner = googletest.TextTestRunner()
-  itersuite = googletest.TestLoader().loadTestsFromTestCase(FlagsTest)
-  runner.run(itersuite)
+  app.run()
diff --git a/tensorflow/python/saved_model/BUILD b/tensorflow/python/saved_model/BUILD
index 4fdd4d37720..3d7e9e810ca 100644
--- a/tensorflow/python/saved_model/BUILD
+++ b/tensorflow/python/saved_model/BUILD
@@ -2,10 +2,7 @@
 # TensorFlow SavedModel.
 
 package(
-    default_visibility = ["//visibility:private"],
-    features = [
-        "-layering_check",
-    ],
+    default_visibility = ["//tensorflow/python/saved_model:__subpackages__"],
 )
 
 licenses(["notice"])  # Apache 2.0
@@ -24,7 +21,6 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/session_bundle:manifest_proto_py",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:platform",
@@ -59,7 +55,6 @@ py_test(
         ":loader",
         ":utils",
         "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/session_bundle:manifest_proto_py",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
diff --git a/tensorflow/python/saved_model/builder.py b/tensorflow/python/saved_model/builder.py
index 8076a7911d6..f17227a2a29 100644
--- a/tensorflow/python/saved_model/builder.py
+++ b/tensorflow/python/saved_model/builder.py
@@ -26,7 +26,7 @@ import os
 
 from google.protobuf.any_pb2 import Any
 
-from tensorflow.contrib.session_bundle import manifest_pb2
+from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.core.protobuf import saved_model_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -123,12 +123,12 @@ class SavedModelBuilder(object):
 
     Args:
       asset_filename: The filename of the asset to be added.
-      asset_tensor: The asset tensor used to populate the tensor binding of the
+      asset_tensor: The asset tensor used to populate the tensor info of the
           asset proto.
     """
-    asset_proto = manifest_pb2.AssetFile()
+    asset_proto = meta_graph_pb2.AssetFileDef()
     asset_proto.filename = asset_filename
-    asset_proto.tensor_binding.tensor_name = asset_tensor.name
+    asset_proto.tensor_info.name = asset_tensor.name
 
     asset_any_proto = Any()
     asset_any_proto.Pack(asset_proto)
diff --git a/tensorflow/python/saved_model/example/BUILD b/tensorflow/python/saved_model/example/BUILD
new file mode 100644
index 00000000000..d49d2cb872d
--- /dev/null
+++ b/tensorflow/python/saved_model/example/BUILD
@@ -0,0 +1,40 @@
+# Description: SavedModel half plus two example.
+
+package(
+    default_visibility = ["//tensorflow/python/saved_model:__subpackages__"],
+    features = [
+        "-layering_check",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+            "g3doc/sitemap.md",
+        ],
+    ),
+    visibility = ["//visibility:public"],
+)
+
+py_binary(
+    name = "saved_model_half_plus_two",
+    srcs = [
+        "saved_model_half_plus_two.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/saved_model:builder",
+        "//tensorflow/python/saved_model:constants",
+        "//tensorflow/python/saved_model:utils",
+    ],
+)
diff --git a/tensorflow/python/saved_model/example/saved_model_half_plus_two.py b/tensorflow/python/saved_model/example/saved_model_half_plus_two.py
new file mode 100644
index 00000000000..8889f7f04d3
--- /dev/null
+++ b/tensorflow/python/saved_model/example/saved_model_half_plus_two.py
@@ -0,0 +1,84 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Exports an example linear regression inference graph.
+
+Exports a TensorFlow graph to /tmp/saved_model/half_plus_two/ based on the
+SavedModel format.
+
+This graph calculates,
+  y = a*x + b
+where a and b are variables with a=0.5 and b=2.
+
+Output from this program is typically used to exercise SavedModel load and
+execution code.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+from tensorflow.core.protobuf import meta_graph_pb2
+from tensorflow.python.saved_model import builder as saved_model_builder
+from tensorflow.python.saved_model import constants
+from tensorflow.python.saved_model import utils
+
+
+def _generate_saved_model_for_half_plus_two(export_dir):
+  """Generates SavedModel for half plus two.
+
+  Args:
+    export_dir: The directory to which the SavedModel should be written.
+  """
+  builder = saved_model_builder.SavedModelBuilder(export_dir)
+
+  with tf.Session(graph=tf.Graph()) as sess:
+    # Set up the model parameters as variables to exercise variable loading
+    # functionality upon restore.
+    a = tf.Variable(0.5, name="a")
+    b = tf.Variable(2.0, name="b")
+
+    # Set up placeholders.
+    x = tf.placeholder(tf.float32, name="x")
+    y = tf.add(tf.mul(a, x), b, name="y")
+
+    # Set up the signature for regression with input and output tensor
+    # specification.
+    input_tensor = meta_graph_pb2.TensorInfo()
+    input_tensor.name = x.name
+    signature_inputs = {"input": input_tensor}
+
+    output_tensor = meta_graph_pb2.TensorInfo()
+    output_tensor.name = y.name
+    signature_outputs = {"output": output_tensor}
+    signature_def = utils.build_signature_def(signature_inputs,
+                                              signature_outputs, "regression")
+
+    # Initialize all variables and then save the SavedModel.
+    sess.run(tf.initialize_all_variables())
+    builder.add_meta_graph_and_variables(
+        sess, [constants.TAG_SERVING],
+        signature_def_map={"regression": signature_def})
+    builder.save()
+
+
+def main(_):
+  export_dir = "/tmp/saved_model/half_plus_two"
+  _generate_saved_model_for_half_plus_two(export_dir)
+
+
+if __name__ == "__main__":
+  tf.app.run()
diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py
index 8b58c1ec456..0e33250e28b 100644
--- a/tensorflow/python/saved_model/saved_model_test.py
+++ b/tensorflow/python/saved_model/saved_model_test.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 import os
 import tensorflow as tf
 
-from tensorflow.contrib.session_bundle import manifest_pb2
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.python.framework import errors
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.saved_model import builder as saved_model_builder
@@ -363,7 +363,7 @@ class SavedModelTest(tf.test.TestCase):
       collection_def = foo_graph.collection_def
       assets_any = collection_def[constants.ASSETS_KEY].any_list.value
       self.assertEqual(len(assets_any), 1)
-      asset = manifest_pb2.AssetFile()
+      asset = meta_graph_pb2.AssetFileDef()
       assets_any[0].Unpack(asset)
       assets_path = os.path.join(
           compat.as_bytes(export_dir),
@@ -372,7 +372,7 @@ class SavedModelTest(tf.test.TestCase):
       asset_contents = file_io.read_file_to_string(assets_path)
       self.assertEqual("foo bar baz", compat.as_text(asset_contents))
       self.assertEqual("hello42.txt", asset.filename)
-      self.assertEqual("asset_file_tensor:0", asset.tensor_binding.tensor_name)
+      self.assertEqual("asset_file_tensor:0", asset.tensor_info.name)
       ignored_asset_path = os.path.join(
           compat.as_bytes(export_dir),
           compat.as_bytes(constants.ASSETS_DIRECTORY),
diff --git a/tensorflow/python/summary/event_multiplexer.py b/tensorflow/python/summary/event_multiplexer.py
index 56ead9f8edd..5131b0612da 100644
--- a/tensorflow/python/summary/event_multiplexer.py
+++ b/tensorflow/python/summary/event_multiplexer.py
@@ -70,6 +70,7 @@ class EventMultiplexer(object):
   @@AddRunsFromDirectory
   @@Reload
   @@Runs
+  @@RunPaths
   @@Scalars
   @@Graph
   @@Histograms
@@ -357,6 +358,10 @@ class EventMultiplexer(object):
       items = list(six.iteritems(self._accumulators))
     return {run_name: accumulator.Tags() for run_name, accumulator in items}
 
+  def RunPaths(self):
+    """Returns a dict mapping run names to event file paths."""
+    return self._paths
+
   def _GetAccumulator(self, run):
     with self._accumulators_mutex:
       return self._accumulators[run]
diff --git a/tensorflow/python/tools/inspect_checkpoint.py b/tensorflow/python/tools/inspect_checkpoint.py
index cbc48961239..f29ecc5b6aa 100644
--- a/tensorflow/python/tools/inspect_checkpoint.py
+++ b/tensorflow/python/tools/inspect_checkpoint.py
@@ -25,6 +25,8 @@ import tensorflow as tf
 FLAGS = tf.app.flags.FLAGS
 tf.app.flags.DEFINE_string("file_name", "", "Checkpoint filename")
 tf.app.flags.DEFINE_string("tensor_name", "", "Name of the tensor to inspect")
+tf.app.flags.DEFINE_bool("all_tensors", False,
+                         "If True, print the values of all the tensors.")
 
 
 def print_tensors_in_checkpoint_file(file_name, tensor_name):
@@ -41,7 +43,12 @@ def print_tensors_in_checkpoint_file(file_name, tensor_name):
   """
   try:
     reader = tf.train.NewCheckpointReader(file_name)
-    if not tensor_name:
+    if FLAGS.all_tensors:
+      var_to_shape_map = reader.get_variable_to_shape_map()
+      for key in var_to_shape_map:
+        print("tensor_name: ", key)
+        print(reader.get_tensor(key))
+    elif not tensor_name:
       print(reader.debug_string().decode("utf-8"))
     else:
       print("tensor_name: ", tensor_name)
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index ee5665cf0ff..b1b40061468 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -1335,10 +1335,11 @@ class Saver(object):
 
     # Performs this check only for V1, as the V2 restore op can read either a
     # V1 ckpt or a V2 ckpt, making this check invalid.
-    if (self.saver_def.version is saver_pb2.SaverDef.V1) and (
-        not file_io.get_matching_files(
-            _prefix_to_checkpoint_path(save_path, self.saver_def.version))):
-      raise ValueError("Restore called with invalid save path %s" % save_path)
+    if self.saver_def.version == saver_pb2.SaverDef.V1:
+      file_path = _prefix_to_checkpoint_path(save_path, self.saver_def.version)
+      if not file_io.get_matching_files(file_path):
+        raise ValueError("Restore called with invalid save path: %r. "
+                         "File path is: %r" % (save_path, file_path))
 
     sess.run(self.saver_def.restore_op_name,
              {self.saver_def.filename_tensor_name: save_path})
@@ -1355,7 +1356,7 @@ class Saver(object):
 
 
 def _prefix_to_checkpoint_path(prefix, format_version=saver_pb2.SaverDef.V1):
-  """Yields the pathname of a checkpoint file, given the checkpoint prefix.
+  """Returns the pathname of a checkpoint file, given the checkpoint prefix.
 
   For V1 checkpoint, simply returns the prefix itself (the data file).  For V2,
   returns the pathname to the index file.
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index e3bbf88bfcd..c5923bb5107 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -187,6 +187,14 @@ class SaverTest(tf.test.TestCase):
       self.assertEqual(b"k1", v2_2.keys().eval())
       self.assertEqual(30.0, v2_2.values().eval())
 
+  def testInvalidPath(self):
+    v0 = tf.Variable(0, name="v0")
+    with self.test_session() as sess:
+      save = tf.train.Saver({"v0": v0})
+      with self.assertRaisesRegexp(ValueError,
+                                   "^Restore called with invalid save path.*"):
+        save.restore(sess, "invalid path")
+
   def testInt64(self):
     save_path = os.path.join(self.get_temp_dir(), "int64")
 
@@ -1541,7 +1549,7 @@ class MetaGraphTest(tf.test.TestCase):
       v0 = tf.Variable(0.0)
       var = tf.Variable(10.0)
       tf.add(v0, var)
-      @function.Defun(x=tf.float32)
+      @function.Defun(tf.float32)
       def minus_one(x):
         return x - 1
       minus_one(tf.identity(v0))
diff --git a/tensorflow/python/training/summary_io.py b/tensorflow/python/training/summary_io.py
index 53f80a373b1..90a692f29bb 100644
--- a/tensorflow/python/training/summary_io.py
+++ b/tensorflow/python/training/summary_io.py
@@ -52,6 +52,7 @@ class SummaryWriter(object):
   @@add_event
   @@add_graph
   @@add_run_metadata
+  @@get_logdir
 
   @@flush
   @@close
@@ -113,6 +114,10 @@ class SummaryWriter(object):
       # Calling it with both graph and graph_def for backward compatibility.
       self.add_graph(graph=graph, graph_def=graph_def)
 
+  def get_logdir(self):
+    """Returns the directory where event files will be written."""
+    return self._logdir
+
   def reopen(self):
     """Reopens the summary writer.
 
diff --git a/tensorflow/python/util/compat.py b/tensorflow/python/util/compat.py
index fff50e6f833..2b7e7a976bd 100644
--- a/tensorflow/python/util/compat.py
+++ b/tensorflow/python/util/compat.py
@@ -13,15 +13,34 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Functions for Python 2 vs. 3 compatibility."""
+"""Functions for Python 2 vs. 3 compatibility.
+
+## Conversion routines
+In addition to the functions below, `as_str` converts an object to a `str`.
+
+@@as_bytes
+@@as_text
+@@as_str_any
+
+## Types
+The compatibility module also provides the following types:
+
+* `bytes_or_text_types`
+* `complex_types`
+* `integral_types`
+* `real_types`
+"""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numbers
+import numbers as _numbers
 
-import numpy as np
-import six
+import numpy as _np
+import six as _six
+
+from tensorflow.python.util.all_util import remove_undocumented
 
 
 def as_bytes(bytes_or_text):
@@ -36,7 +55,7 @@ def as_bytes(bytes_or_text):
   Raises:
     TypeError: If `bytes_or_text` is not a binary or unicode string.
   """
-  if isinstance(bytes_or_text, six.text_type):
+  if isinstance(bytes_or_text, _six.text_type):
     return bytes_or_text.encode('utf-8')
   elif isinstance(bytes_or_text, bytes):
     return bytes_or_text
@@ -57,7 +76,7 @@ def as_text(bytes_or_text):
   Raises:
     TypeError: If `bytes_or_text` is not a binary or unicode string.
   """
-  if isinstance(bytes_or_text, six.text_type):
+  if isinstance(bytes_or_text, _six.text_type):
     return bytes_or_text
   elif isinstance(bytes_or_text, bytes):
     return bytes_or_text.decode('utf-8')
@@ -65,8 +84,8 @@ def as_text(bytes_or_text):
     raise TypeError('Expected binary or unicode string, got %r' % bytes_or_text)
 
 
-# Convert an object to a `str` in both Python 2 and 3
-if six.PY2:
+# Convert an object to a `str` in both Python 2 and 3.
+if _six.PY2:
   as_str = as_bytes
 else:
   as_str = as_text
@@ -89,10 +108,21 @@ def as_str_any(value):
 
 # Numpy 1.8 scalars don't inherit from numbers.Integral in Python 3, so we
 # need to check them specifically.  The same goes from Real and Complex.
-integral_types = (numbers.Integral, np.integer)
-real_types = (numbers.Real, np.integer, np.floating)
-complex_types = (numbers.Complex, np.number)
+integral_types = (_numbers.Integral, _np.integer)
+real_types = (_numbers.Real, _np.integer, _np.floating)
+complex_types = (_numbers.Complex, _np.number)
 
 
-# Either bytes or text
-bytes_or_text_types = (bytes, six.text_type)
+# Either bytes or text.
+bytes_or_text_types = (bytes, _six.text_type)
+
+
+_allowed_symbols = [
+    'as_str',
+    'bytes_or_text_types',
+    'complex_types',
+    'integral_types',
+    'real_types',
+]
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/util/future_api.py b/tensorflow/python/util/future_api.py
new file mode 100644
index 00000000000..89e0c3a9840
--- /dev/null
+++ b/tensorflow/python/util/future_api.py
@@ -0,0 +1,37 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Ensure compatibility with future tensorflow versions.
+
+   This ensures that your code will be minimally impacted by future tensorflow
+   API changes. Import the module to prevent accidental usage of stale APIs.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+
+delattr(tf, 'arg_max')
+delattr(tf, 'arg_min')
+delattr(tf, 'create_partitioned_variables')
+delattr(tf, 'deserialize_many_sparse')
+delattr(tf, 'lin_space')
+delattr(tf, 'list_diff')  # Use tf.listdiff instead.
+delattr(tf, 'parse_single_sequence_example')
+delattr(tf, 'serialize_many_sparse')
+delattr(tf, 'serialize_sparse')
+delattr(tf, 'sparse_matmul')  # Use tf.matmul instead.
diff --git a/tensorflow/python/util/future_api_test.py b/tensorflow/python/util/future_api_test.py
new file mode 100644
index 00000000000..7cafdec6f0a
--- /dev/null
+++ b/tensorflow/python/util/future_api_test.py
@@ -0,0 +1,35 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for future_api."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+# pylint: disable=unused-import
+from tensorflow.python.util import future_api
+# pylint: enable=unused-import
+
+
+class ExampleParserConfigurationTest(tf.test.TestCase):
+
+  def testBasic(self):
+    self.assertFalse(hasattr(tf, 'arg_max'))
+    self.assertTrue(hasattr(tf, 'argmax'))
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 67576908325..6c3e4e90e82 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -1942,11 +1942,16 @@ bool CudnnSupport::DoConvolveImpl(
   std::unique_ptr<CUDATimer> timer;
   if (is_profiling) {
     timer.reset(new CUDATimer(parent_));
-    timer->Init();
+    if (!timer->Init()) {
+      return false;
+    }
     // The start and stop of the timer should be as close to the Cudnn call as
     // possible. It is still possible for other threads to issue workload on
     // to this stream. So it could take multiple profiling measurements.
-    timer->Start(AsCUDAStream(stream));
+    if (!timer->Start(AsCUDAStream(stream))) {
+      timer->Destroy();
+      return false;
+    }
   }
   status = dynload::cudnnConvolutionForward(
       parent_, ToHandle(dnn_handle_),
@@ -1957,7 +1962,10 @@ bool CudnnSupport::DoConvolveImpl(
       /*workSpaceSizeInBytes=*/scratch.size(), /*beta=*/&beta,
       /*destDesc=*/output_nd.handle(), /*destData=*/output_data->opaque());
   if (is_profiling) {
-    timer->Stop(AsCUDAStream(stream));
+    if (!timer->Stop(AsCUDAStream(stream))) {
+      timer->Destroy();
+      return false;
+    }
     output_profile_result->set_is_valid(true);
     output_profile_result->set_algorithm(algo);
     output_profile_result->set_elapsed_time_in_ms(
diff --git a/tensorflow/tensorboard/BUILD b/tensorflow/tensorboard/BUILD
index e434a7c3f2d..7408db8b9b7 100644
--- a/tensorflow/tensorboard/BUILD
+++ b/tensorflow/tensorboard/BUILD
@@ -46,3 +46,39 @@ filegroup(
     ),
     visibility = ["//tensorflow:__subpackages__"],
 )
+
+###### PLUGINS ######
+
+# Plugins don't have their own packages (BUILD files) because we want to
+# have only one BUILD file since each BUILD file needs special rewrite rules
+# in the git world.
+
+py_library(
+    name = "base_plugin",
+    srcs = ["plugins/base_plugin.py"],
+    srcs_version = "PY2AND3",
+)
+
+## Embedding projector ##
+py_library(
+    name = "projector",
+    srcs = glob(["plugins/projector/**/*.py"]),
+    srcs_version = "PY2AND3",
+    deps = [
+        ":base_plugin",
+        "//tensorflow/contrib/tensorboard:projector",
+        "//tensorflow/contrib/tensorboard:protos_all_py",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:training",
+    ],
+)
+
+py_library(
+    name = "plugins",
+    srcs = ["plugins/__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        # All registered plugins go in here.
+        ":projector",
+    ],
+)
diff --git a/tensorflow/tensorboard/backend/BUILD b/tensorflow/tensorboard/backend/BUILD
index 8aba2adbf1e..48af218eb88 100644
--- a/tensorflow/tensorboard/backend/BUILD
+++ b/tensorflow/tensorboard/backend/BUILD
@@ -16,6 +16,7 @@ py_library(
         "//tensorflow/python:platform",
         "//tensorflow/python:summary",
         "//tensorflow/python:util",
+        "//tensorflow/tensorboard:plugins",
         "//tensorflow/tensorboard/lib/python:json_util",
     ],
 )
diff --git a/tensorflow/tensorboard/backend/handler.py b/tensorflow/tensorboard/backend/handler.py
index d39d66bf97c..5a39f039321 100644
--- a/tensorflow/tensorboard/backend/handler.py
+++ b/tensorflow/tensorboard/backend/handler.py
@@ -30,7 +30,7 @@ import json
 import mimetypes
 import os
 import re
-
+import six
 from six import BytesIO
 from six import StringIO
 from six.moves import BaseHTTPServer
@@ -44,10 +44,12 @@ from tensorflow.python.summary import event_accumulator
 from tensorflow.python.util import compat
 from tensorflow.tensorboard.backend import process_graph
 from tensorflow.tensorboard.lib.python import json_util
+from tensorflow.tensorboard.plugins import REGISTERED_PLUGINS
 
 
 DATA_PREFIX = '/data'
 RUNS_ROUTE = '/runs'
+PLUGIN_PREFIX = '/plugin'
 SCALARS_ROUTE = '/' + event_accumulator.SCALARS
 IMAGES_ROUTE = '/' + event_accumulator.IMAGES
 AUDIO_ROUTE = '/' + event_accumulator.AUDIO
@@ -201,7 +203,7 @@ class TensorboardHandler(BaseHTTPServer.BaseHTTPRequestHandler):
     accept_encoding = self.headers.get('Accept-Encoding', '')
     return _ALLOWS_GZIP_PATTERN.search(accept_encoding) is not None
 
-  def _send_gzip_response(self, content, content_type, code=200):
+  def send_gzip_response(self, content, content_type, code=200):
     """Writes the given content as gzip response using the given content type.
 
     If the HTTP client does not accept gzip encoding, then the response will be
@@ -222,7 +224,7 @@ class TensorboardHandler(BaseHTTPServer.BaseHTTPRequestHandler):
       encoding = 'gzip'
     self._respond(content, content_type, code, encoding)
 
-  def _send_json_response(self, obj, code=200):
+  def send_json_response(self, obj, code=200):
     """Writes out the given object as JSON using the given HTTP status code.
 
     This also replaces special float values with stringified versions.
@@ -234,10 +236,10 @@ class TensorboardHandler(BaseHTTPServer.BaseHTTPRequestHandler):
     content = json.dumps(json_util.WrapSpecialFloats(obj))
     self._respond(content, 'application/json', code)
 
-  def _send_csv_response(self, serialized_csv, code=200):
+  def send_csv_response(self, serialized_csv, code=200):
     """Writes out the given string, which represents CSV data.
 
-    Unlike _send_json_response, this does *not* perform the CSV serialization
+    Unlike send_json_response, this does *not* perform the CSV serialization
     for you. It only sets the proper headers.
 
     Args:
@@ -281,9 +283,9 @@ class TensorboardHandler(BaseHTTPServer.BaseHTTPRequestHandler):
       writer = csv.writer(string_io)
       writer.writerow(['Wall time', 'Step', 'Value'])
       writer.writerows(values)
-      self._send_csv_response(string_io.getvalue())
+      self.send_csv_response(string_io.getvalue())
     else:
-      self._send_json_response(values)
+      self.send_json_response(values)
 
   def _serve_graph(self, query_params):
     """Given a single run, return the graph definition in json format."""
@@ -318,7 +320,7 @@ class TensorboardHandler(BaseHTTPServer.BaseHTTPRequestHandler):
     # Serialize the graph to pbtxt format.
     graph_pbtxt = str(graph)
     # Gzip it and send it to the user.
-    self._send_gzip_response(graph_pbtxt, 'text/plain')
+    self.send_gzip_response(graph_pbtxt, 'text/plain')
 
   def _serve_run_metadata(self, query_params):
     """Given a tag and a TensorFlow run, return the session.run() metadata."""
@@ -339,14 +341,14 @@ class TensorboardHandler(BaseHTTPServer.BaseHTTPRequestHandler):
     # Serialize to pbtxt format.
     run_metadata_pbtxt = str(run_metadata)
     # Gzip it and send it to the user.
-    self._send_gzip_response(run_metadata_pbtxt, 'text/plain')
+    self.send_gzip_response(run_metadata_pbtxt, 'text/plain')
 
   def _serve_histograms(self, query_params):
     """Given a tag and single run, return an array of histogram values."""
     tag = query_params.get('tag')
     run = query_params.get('run')
     values = self._multiplexer.Histograms(run, tag)
-    self._send_json_response(values)
+    self.send_json_response(values)
 
   def _serve_compressed_histograms(self, query_params):
     """Given a tag and single run, return an array of compressed histograms."""
@@ -371,9 +373,9 @@ class TensorboardHandler(BaseHTTPServer.BaseHTTPRequestHandler):
         for value in compressed_histogram.compressed_histogram_values:
           row += [value.rank_in_bps, value.value]
         writer.writerow(row)
-      self._send_csv_response(string_io.getvalue())
+      self.send_csv_response(string_io.getvalue())
     else:
-      self._send_json_response(compressed_histograms)
+      self.send_json_response(compressed_histograms)
 
   def _serve_images(self, query_params):
     """Given a tag and list of runs, serve a list of images.
@@ -391,7 +393,7 @@ class TensorboardHandler(BaseHTTPServer.BaseHTTPRequestHandler):
 
     images = self._multiplexer.Images(run, tag)
     response = self._image_response_for_run(images, run, tag)
-    self._send_json_response(response)
+    self.send_json_response(response)
 
   def _serve_image(self, query_params):
     """Serves an individual image."""
@@ -443,7 +445,7 @@ class TensorboardHandler(BaseHTTPServer.BaseHTTPRequestHandler):
 
     audio_list = self._multiplexer.Audio(run, tag)
     response = self._audio_response_for_run(audio_list, run, tag)
-    self._send_json_response(response)
+    self.send_json_response(response)
 
   def _serve_individual_audio(self, query_params):
     """Serves an individual audio clip."""
@@ -499,7 +501,7 @@ class TensorboardHandler(BaseHTTPServer.BaseHTTPRequestHandler):
         logging.warning('Unable to get first event timestamp for run %s',
                         run_name)
         run_data['firstEventTimestamp'] = None
-    self._send_json_response(runs)
+    self.send_json_response(runs)
 
   def _serve_index(self, unused_query_params):
     """Serves the index page (i.e., the tensorboard app itself)."""
@@ -574,6 +576,18 @@ class TensorboardHandler(BaseHTTPServer.BaseHTTPRequestHandler):
         '/app.js': self._serve_js
     }
 
+    # Serve the routes from the registered plugins using their name as the route
+    # prefix. For example, if plugin z has two routes /a and /b, they will be
+    # served as /data/plugin/z/a and /data/plugin/z/b.
+    for name in REGISTERED_PLUGINS:
+      plug = REGISTERED_PLUGINS[name]
+      # Initialize the plug by passing the main http handler.
+      plug.initialize(self)
+      plugin_handlers = plug.get_plugin_handlers(self._multiplexer.RunPaths())
+      for route, handler in six.iteritems(plugin_handlers):
+        path = DATA_PREFIX + PLUGIN_PREFIX + '/' + name + route
+        data_handlers[path] = handler
+
     query_params = urlparse.parse_qs(parsed_url.query)
     # parse_qs returns a list of values for each key; we're only interested in
     # the first.
diff --git a/tensorflow/tensorboard/backend/server_test.py b/tensorflow/tensorboard/backend/server_test.py
index 8e564342a26..55f98101e59 100644
--- a/tensorflow/tensorboard/backend/server_test.py
+++ b/tensorflow/tensorboard/backend/server_test.py
@@ -36,9 +36,11 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 import tensorflow as tf
 
 from google.protobuf import text_format
+from tensorflow.contrib.tensorboard.plugins.projector.projector_config_pb2 import ProjectorConfig
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.summary import event_multiplexer
 from tensorflow.tensorboard.backend import server
+from tensorflow.tensorboard.plugins import REGISTERED_PLUGINS
 
 
 class TensorboardServerTest(tf.test.TestCase):
@@ -181,6 +183,48 @@ class TensorboardServerTest(tf.test.TestCase):
     self.assertEqual(graph.node[1].attr['_very_large_attrs'].list.s,
                      [b'very_large_attr'])
 
+  def testProjectorRunsWithEmbeddings(self):
+    """Test the format of /runs endpoint in projector."""
+    if 'projector' not in REGISTERED_PLUGINS:
+      return
+
+    run_json = self._getJson('/data/plugin/projector/runs')
+
+    self.assertEqual(run_json, ['run1'])
+
+  def testProjectorInfo(self):
+    """Test the format of /info endpoint in projector."""
+    if 'projector' not in REGISTERED_PLUGINS:
+      return
+
+    info_json = self._getJson('/data/plugin/projector/info?run=run1')
+    self.assertEqual(info_json['tensors'], {
+        'var1': {
+            'shape': [1, 2],
+            'name': 'var1',
+            'metadataFile': None
+        },
+        'var2': {
+            'shape': [10, 10],
+            'name': 'var2',
+            'metadataFile': None
+        },
+        'var3': {
+            'shape': [100, 100],
+            'name': 'var3',
+            'metadataFile': None
+        }
+    })
+
+  def testProjectorTensor(self):
+    """Test the format of /tensor endpoint in projector."""
+    if 'projector' not in REGISTERED_PLUGINS:
+      return
+
+    tensor_tsv = (self._get('/data/plugin/projector/tensor?run=run1&name=var1')
+                  .read())
+    self.assertEqual(tensor_tsv, b'6.0\t6.0')
+
   def testAcceptGzip_compressesResponse(self):
     response = self._get('/data/graph?run=run1&limit_attr_size=1024'
                          '&large_attrs_key=_very_large_attrs',
@@ -295,6 +339,29 @@ class TensorboardServerTest(tf.test.TestCase):
     writer.flush()
     writer.close()
 
+    if 'projector' in REGISTERED_PLUGINS:
+      self._GenerateProjectorTestData(run1_path)
+
+  def _GenerateProjectorTestData(self, run_path):
+    # Write a projector config file in run1.
+    config_path = os.path.join(run_path, 'projector_config.pbtxt')
+    config = ProjectorConfig()
+    config_pbtxt = text_format.MessageToString(config)
+    with tf.gfile.GFile(config_path, 'w') as f:
+      f.write(config_pbtxt)
+
+    # Write a checkpoint with some dummy variables.
+    with tf.Graph().as_default():
+      sess = tf.Session()
+      checkpoint_path = os.path.join(run_path, 'model')
+      tf.get_variable(
+          'var1', [1, 2], initializer=tf.constant_initializer(6.0))
+      tf.get_variable('var2', [10, 10])
+      tf.get_variable('var3', [100, 100])
+      sess.run(tf.initialize_all_variables())
+      saver = tf.train.Saver()
+      saver.save(sess, checkpoint_path)
+
 
 class ParseEventFilesSpecTest(tf.test.TestCase):
 
diff --git a/tensorflow/tensorboard/components/tf-dashboard-common/categorizer.ts b/tensorflow/tensorboard/components/tf-dashboard-common/categorizer.ts
index eba1b0f7f44..e8a9751f08f 100644
--- a/tensorflow/tensorboard/components/tf-dashboard-common/categorizer.ts
+++ b/tensorflow/tensorboard/components/tf-dashboard-common/categorizer.ts
@@ -49,11 +49,25 @@ module Categorizer {
   /* Canonical TensorFlow ops are namespaced using forward slashes.
    * This fallback categorizer categorizes by the top-level namespace.
    */
+  export var topLevelNamespaceCategorizer: Categorizer = splitCategorizer(/\//);
+
   // Try to produce good categorizations on legacy graphs, which often
   // are namespaced like l1_foo/bar or l2_baz/bam.
   // If there is no leading underscore before the first forward slash,
   // then it behaves the same as topLevelNamespaceCategorizer
-  export var rootNameUnderscoreCategorizer = rootNameCategorizer(/[\/_]/);
+  export var legacyUnderscoreCategorizer: Categorizer =
+      splitCategorizer(/[\/_]/);
+
+  export function fallbackCategorizer(s: string): Categorizer {
+    switch (s) {
+      case 'TopLevelNamespaceCategorizer':
+        return topLevelNamespaceCategorizer;
+      case 'LegacyUnderscoreCategorizer':
+        return legacyUnderscoreCategorizer;
+      default:
+        throw new Error('Unrecognized categorization strategy: ' + s);
+    }
+  }
 
   /* An 'extractor' is a function that takes a tag name, and 'extracts' a
    * category name.
@@ -89,33 +103,13 @@ module Categorizer {
     };
   }
 
-  /** Split on a regex, taking just the first element after splitting.
-   * It's like getting the root directory. E.g. if you split on slash, then
-   * 'foo/bar/zod' will go to 'foo'
-   */
-  function rootNameCategorizer(r: RegExp): Categorizer {
-    let extractor = (t: string) => { return t.split(r)[0]; };
+  function splitCategorizer(r: RegExp): Categorizer {
+    let extractor = (t: string) => {
+      return t.split(r)[0];
+    };
     return extractorToCategorizer(extractor);
   }
 
-  /* Split on a regex, taking all the prefix until the last split.
-   * It's like getting the dirname of a path. E.g. if you split on slash, then
-   * 'foo/bar/zod' will go to 'foo/bar'.
-   * In the case where there are no splits (e.g. 'foo') then it uses 'foo' as
-   * the category name.
-   */
-  function dnameExtractor(t: string) {
-    let splits = t.split('/');
-    if (splits.length === 1) {
-      return t;
-    } else {
-      let last = _.last(splits);
-      return t.slice(0, t.length - last.length - 1);
-    }
-  }
-
-  export var directoryNameCategorizer = extractorToCategorizer(dnameExtractor);
-
   export interface CategoryDefinition {
     name: string;
     matches: (t: string) => boolean;
@@ -123,7 +117,9 @@ module Categorizer {
 
   export function defineCategory(ruledef: string): CategoryDefinition {
     let r = new RegExp(ruledef);
-    let f = function(tag: string): boolean { return r.test(tag); };
+    let f = function(tag: string): boolean {
+      return r.test(tag);
+    };
     return { name: ruledef, matches: f };
   }
 
@@ -146,17 +142,6 @@ module Categorizer {
     };
   }
 
-  export function fallbackCategorizer(s: string): Categorizer {
-    switch (s) {
-      case 'DirectoryNameCategorizer':
-        return directoryNameCategorizer;
-      case 'RootNameUnderscoreCategorizer':
-        return rootNameUnderscoreCategorizer;
-      default:
-        throw new Error('Unrecognized categorization strategy: ' + s);
-    }
-  }
-
   export function categorizer(s: CustomCategorization): Categorizer {
     let rules = s.categoryDefinitions.map(defineCategory);
     let fallback = fallbackCategorizer(s.fallbackCategorizer);
diff --git a/tensorflow/tensorboard/components/tf-dashboard-common/test/categorizerTest.ts b/tensorflow/tensorboard/components/tf-dashboard-common/test/categorizerTest.ts
index 43ff5f8850e..c28e1851fdc 100644
--- a/tensorflow/tensorboard/components/tf-dashboard-common/test/categorizerTest.ts
+++ b/tensorflow/tensorboard/components/tf-dashboard-common/test/categorizerTest.ts
@@ -14,10 +14,13 @@ limitations under the License.
 ==============================================================================*/
 
 module Categorizer {
+  let assert = chai.assert;
+
   describe('categorizer', () => {
-    describe('directoryNameCategorizer', () => {
-      it('returns empty array on empty tags',
-         () => { assert.lengthOf(directoryNameCategorizer([]), 0); });
+    describe('topLevelNamespaceCategorizer', () => {
+      it('returns empty array on empty tags', () => {
+        assert.lengthOf(topLevelNamespaceCategorizer([]), 0);
+      });
 
       it('handles a simple case', () => {
         let simple = [
@@ -27,9 +30,9 @@ module Categorizer {
         let expected = [
           {name: 'foo1', tags: ['foo1/bar', 'foo1/zod']},
           {name: 'foo2', tags: ['foo2/bar', 'foo2/zod']},
-          {name: 'gosh/lod', tags: ['gosh/lod/mar', 'gosh/lod/ned']},
+          {name: 'gosh', tags: ['gosh/lod/mar', 'gosh/lod/ned']},
         ];
-        assert.deepEqual(directoryNameCategorizer(simple), expected);
+        assert.deepEqual(topLevelNamespaceCategorizer(simple), expected);
       });
 
       it('orders the categories', () => {
@@ -42,12 +45,12 @@ module Categorizer {
           {name: 'f', tags: ['f']},
           {name: 'g', tags: ['g']},
         ];
-        assert.deepEqual(directoryNameCategorizer(test), expected);
+        assert.deepEqual(topLevelNamespaceCategorizer(test), expected);
       });
 
       it('handles cases where category names overlap node names', () => {
         let test = ['a', 'a/a', 'a/b', 'a/c', 'b', 'b/a'];
-        let actual = directoryNameCategorizer(test);
+        let actual = topLevelNamespaceCategorizer(test);
         let expected = [
           {name: 'a', tags: ['a', 'a/a', 'a/b', 'a/c']},
           {name: 'b', tags: ['b', 'b/a']},
@@ -57,38 +60,17 @@ module Categorizer {
 
       it('handles singleton case', () => {
         assert.deepEqual(
-            directoryNameCategorizer(['a']), [{name: 'a', tags: ['a']}]);
-      });
-
-      it('splits on bottom level name', () => {
-        let example = [
-          'foo1/bar',
-          'foo1/zod',
-          'foo2/bar',
-          'foo2/zod',
-          'gosh/lod/mar',
-          'gosh/lod/ned',
-          'gosh/zod/mar',
-          'gosh/zod/ned/y',
-        ];
-        let expected = [
-          {name: 'foo1', tags: ['foo1/bar', 'foo1/zod']},
-          {name: 'foo2', tags: ['foo2/bar', 'foo2/zod']},
-          {name: 'gosh/lod', tags: ['gosh/lod/mar', 'gosh/lod/ned']},
-          {name: 'gosh/zod', tags: ['gosh/zod/mar']},
-          {name: 'gosh/zod/ned', tags: ['gosh/zod/ned/y']},
-        ];
-        assert.deepEqual(directoryNameCategorizer(example), expected);
+            topLevelNamespaceCategorizer(['a']), [{name: 'a', tags: ['a']}]);
       });
     });
 
-    describe('RootNameUnderscoreCategorizer', () => {
+    describe('legacyUnderscoreCategorizer', () => {
       it('splits by shorter of first _ or /', () => {
         let tags = [
           'l0_bar/foo', 'l0_bar/baz', 'l0_foo/wob', 'l1_zoink/bla',
           'l1_wibble/woz', 'l1/foo_woink', 'l2/wozzle_wizzle'
         ];
-        let actual = rootNameUnderscoreCategorizer(tags);
+        let actual = legacyUnderscoreCategorizer(tags);
         let expected = [
           {name: 'l0', tags: ['l0_bar/baz', 'l0_bar/foo', 'l0_foo/wob']},
           {name: 'l1', tags: ['l1/foo_woink', 'l1_wibble/woz', 'l1_zoink/bla']},
diff --git a/tensorflow/tensorboard/components/tf-dashboard-common/tf-categorizer.html b/tensorflow/tensorboard/components/tf-dashboard-common/tf-categorizer.html
index db17bd4dab6..4b588f63231 100644
--- a/tensorflow/tensorboard/components/tf-dashboard-common/tf-categorizer.html
+++ b/tensorflow/tensorboard/components/tf-dashboard-common/tf-categorizer.html
@@ -48,8 +48,8 @@ categories are exclusive.
     </div>
     <div id="underscore-categorization">
       <paper-checkbox
-        checked="{{useLegacyCategorizer}}"
-      >Use Legacy Categorizer</paper-checkbox>
+        checked="{{splitOnUnderscore}}"
+      >Split on underscores</paper-checkbox>
     </div>
     <style>
       :host {
@@ -76,14 +76,14 @@ categories are exclusive.
         categoriesAreExclusive: {type: Boolean, value: true},
         fallbackCategorizer: {
           type: String,
-          computed: "chooseFallbackCategorizer(useLegacyCategorizer)"
+          computed: "chooseFallbackCategorizer(splitOnUnderscore)"
         },
-        useLegacyCategorizer: {
+        splitOnUnderscore: {
           type: Boolean,
           notify: true,
-          value: TF.URIStorage.getBooleanInitializer('useLegacyCategorizer',
+          value: TF.URIStorage.getBooleanInitializer('splitOnUnderscore',
               false),
-          observer: '_useLegacyCategorizerObserver'
+          observer: '_splitOnUnderscoreObserver'
         },
         categorizer: {
           type: Object,
@@ -106,15 +106,15 @@ categories are exclusive.
           this._setCategories(categories);
         })
       },
-      chooseFallbackCategorizer: function(useLegacyCategorizer) {
-        if (useLegacyCategorizer) {
-          return "RootNameUnderscoreCategorizer";
+      chooseFallbackCategorizer: function(splitOnUnderscore) {
+        if (splitOnUnderscore) {
+          return "LegacyUnderscoreCategorizer";
         } else {
-          return "DirectoryNameCategorizer";
+          return "TopLevelNamespaceCategorizer";
         }
       },
-      _useLegacyCategorizerObserver: TF.URIStorage.getBooleanObserver(
-          'useLegacyCategorizer', false)
+      _splitOnUnderscoreObserver: TF.URIStorage.getBooleanObserver(
+          'splitOnUnderscore', false)
     });
   </script>
 </dom-module>
diff --git a/tensorflow/tensorboard/components/vz-projector/data.ts b/tensorflow/tensorboard/components/vz-projector/data.ts
index 48ed9373458..920cbe695ab 100644
--- a/tensorflow/tensorboard/components/vz-projector/data.ts
+++ b/tensorflow/tensorboard/components/vz-projector/data.ts
@@ -208,7 +208,7 @@ export class DataSet implements scatterPlot.DataSet {
   /** Projects the dataset along the top 10 principal components. */
   projectPCA(): Promise<void> {
     if (this.projections.has('pca-0')) {
-      return Promise.resolve();
+      return Promise.resolve<void>(null);
     }
     return runAsyncTask('Computing PCA...', () => {
       // Approximate pca vectors by sampling the dimensions.
diff --git a/tensorflow/tensorboard/components/vz-projector/scatterPlotWebGL.ts b/tensorflow/tensorboard/components/vz-projector/scatterPlotWebGL.ts
index 7a959120cea..002707be6c9 100644
--- a/tensorflow/tensorboard/components/vz-projector/scatterPlotWebGL.ts
+++ b/tensorflow/tensorboard/components/vz-projector/scatterPlotWebGL.ts
@@ -20,8 +20,7 @@ import {ScatterPlotWebGLVisualizerAxes} from './scatterPlotWebGLVisualizerAxes';
 import {getNearFarPoints, getProjectedPointFromIndex, vector3DToScreenCoords} from './util';
 import {dist_2D} from './vector';
 
-const BACKGROUND_COLOR_DAY = 0xffffff;
-const BACKGROUND_COLOR_NIGHT = 0x000000;
+const BACKGROUND_COLOR = 0xffffff;
 
 const MAX_ZOOM = 10;
 const MIN_ZOOM = .05;
@@ -65,7 +64,7 @@ const TAR_2D = {
 
 /**
  * Maintains a three.js instantiation and context,
- * animation state, day/night state, and all other logic that's
+ * animation state, and all other logic that's
  * independent of how a 3D scatter plot is actually rendered. Also holds an
  * array of visualizers and dispatches application events to them.
  */
@@ -100,8 +99,7 @@ export class ScatterPlotWebGL implements ScatterPlot {
   private width: number;
 
   private mode: Mode;
-  private isNight: boolean;
-  private backgroundColor: number;
+  private backgroundColor: number = BACKGROUND_COLOR;
 
   private scene: THREE.Scene;
   private renderer: THREE.WebGLRenderer;
@@ -131,11 +129,11 @@ export class ScatterPlotWebGL implements ScatterPlot {
     // Set up THREE.js.
     this.scene = new THREE.Scene();
     this.renderer = new THREE.WebGLRenderer();
+    this.renderer.setClearColor(BACKGROUND_COLOR, 1);
     this.containerNode.appendChild(this.renderer.domElement);
     this.light = new THREE.PointLight(0xFFECBF, 1, 0);
     this.scene.add(this.light);
     this.makeCamera();
-    this.setDayNightMode(false);
 
     // Render now so no black background appears during startup.
     this.renderer.render(this.scene, this.perspCamera);
@@ -585,6 +583,9 @@ export class ScatterPlotWebGL implements ScatterPlot {
 
   setLabelAccessor(labelAccessor: (index: number) => string) {
     this.labelAccessor = labelAccessor;
+    this.visualizers.forEach(v => {
+      v.onSetLabelAccessor(labelAccessor);
+    });
   }
 
   setMode(mode: Mode) {
@@ -649,13 +650,9 @@ export class ScatterPlotWebGL implements ScatterPlot {
   getHighlightedPoints(): number[] { return this.highlightedPoints; }
 
   setDayNightMode(isNight: boolean) {
-    this.isNight = isNight;
-    this.backgroundColor =
-        (isNight ? BACKGROUND_COLOR_NIGHT : BACKGROUND_COLOR_DAY);
-    this.renderer.setClearColor(this.backgroundColor);
-    this.visualizers.forEach(v => {
-      v.onSetDayNightMode(isNight);
-    });
+    d3.select(this.containerNode)
+        .selectAll('canvas')
+        .style('filter', isNight ? 'invert(100%)' : null);
   }
 
   showAxes(show: boolean) {}
diff --git a/tensorflow/tensorboard/components/vz-projector/scatterPlotWebGLVisualizer.ts b/tensorflow/tensorboard/components/vz-projector/scatterPlotWebGLVisualizer.ts
index c95f8f08d6a..72756e1b19e 100644
--- a/tensorflow/tensorboard/components/vz-projector/scatterPlotWebGLVisualizer.ts
+++ b/tensorflow/tensorboard/components/vz-projector/scatterPlotWebGLVisualizer.ts
@@ -40,6 +40,12 @@ export interface ScatterPlotWebGLVisualizer {
    * initialization here.
    */
   onDataSet(dataSet: DataSet, spriteImage: HTMLImageElement);
+  /**
+   * Called when the label accessor (functor that maps point ids to text labels)
+   * changes. The label accessor is also part of RenderContext, but visualizers
+   * may need it outside of a render call, to learn when it changes.
+   */
+  onSetLabelAccessor(labelAccessor: (index: number) => string);
   /**
    * Called immediately before the main scatter plot performs a picking
    * (selection) render. Set up render state for any geometry to use picking IDs
@@ -52,16 +58,12 @@ export interface ScatterPlotWebGLVisualizer {
    */
   onRender(renderContext: RenderContext);
   /**
-   * Called when the projector updates application state (day / night mode,
-   * projection style, etc). Generally followed by a render.
+   * Called when the projector updates application state (projection style,
+   * etc). Generally followed by a render.
    */
   onUpdate();
   /**
    * Called when the canvas size changes.
    */
   onResize(newWidth: number, newHeight: number);
-  /**
-   * Called when the application toggles between day and night mode.
-   */
-  onSetDayNightMode(isNight: boolean);
 }
diff --git a/tensorflow/tensorboard/components/vz-projector/scatterPlotWebGLVisualizer3DLabels.ts b/tensorflow/tensorboard/components/vz-projector/scatterPlotWebGLVisualizer3DLabels.ts
new file mode 100644
index 00000000000..8a96f4108ca
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz-projector/scatterPlotWebGLVisualizer3DLabels.ts
@@ -0,0 +1,338 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+import {RenderContext} from './renderContext';
+import {DataSet} from './scatterPlot';
+import {ScatterPlotWebGL} from './scatterPlotWebGL';
+import {ScatterPlotWebGLVisualizer} from './scatterPlotWebGLVisualizer';
+import {createTexture} from './util';
+
+const FONT_SIZE = 80;
+const LABEL_COLOR = 'black';
+const LABEL_BACKGROUND = 'white';
+const MAX_CANVAS_DIMENSION = 8192;
+const NUM_GLYPHS = 256;
+const RGB_ELEMENTS_PER_ENTRY = 3;
+const XYZ_ELEMENTS_PER_ENTRY = 3;
+const UV_ELEMENTS_PER_ENTRY = 2;
+const VERTICES_PER_GLYPH = 2 * 3;  // 2 triangles, 3 verts per triangle
+
+/**
+ * Each label is made up of triangles (two per letter.) Each vertex, then, is
+ * the corner of one of these triangles (and thus the corner of a letter
+ * rectangle.)
+ * Each has the following attributes:
+ *    posObj: The (x, y) position of the vertex within the label, where the
+ *            bottom center of the word is positioned at (0, 0);
+ *    position: The position of the label in worldspace.
+ *    vUv: The (u, v) coordinates that index into the glyphs sheet (range 0, 1.)
+ *    color: The color of the label (matches the cooresponding point's color.)
+ *    wordShown: Boolean. Whether or not the label is visible.
+ */
+
+const VERTEX_SHADER = `
+    attribute vec2 posObj;
+    varying vec2 vUv;
+    attribute vec3 color;
+    varying vec3 vColor;
+    uniform float camPos;
+
+    float getPointScale() {
+      float normalScale =  3.0;
+      // Distance to the camera (world coordinates.) This is the scale factor.
+      // Note that positions of verts are in world space, scaled so that the
+      // lineheight is 1.
+      float distToCam = length((modelViewMatrix * vec4(position, 1.0)).z);
+      float unscale = distToCam;
+      float scale = max(min(unscale * 10.0, normalScale), unscale * 2.0);
+      return scale * ${1 /
+    FONT_SIZE};
+    }
+
+    void main() {
+      vUv = uv;
+      vColor = color;
+
+      // Make label face camera.
+      // 'At' and 'Up' vectors just match that of the camera.
+      vec3 Vat = normalize(vec3(
+        modelViewMatrix[0][2],
+        modelViewMatrix[1][2],
+        modelViewMatrix[2][2]));
+
+      vec3 Vup = normalize(vec3(
+        modelViewMatrix[0][1],
+        modelViewMatrix[1][1],
+        modelViewMatrix[2][1]));
+
+      vec3 Vright = normalize(cross(Vup, Vat));
+      Vup = cross(Vat, Vright);
+      mat4 pointToCamera = mat4(Vright, 0.0, Vup, 0.0, Vat, 0.0, vec3(0), 1.0);
+
+      vec2 posObj = posObj*getPointScale();
+
+      vec4 posRotated = pointToCamera * vec4(posObj, 0.00001, 1.0);
+      vec4 mvPosition = modelViewMatrix * (vec4(position, 0.0) + posRotated);
+      gl_Position = projectionMatrix * mvPosition;
+    }`;
+
+const FRAGMENT_SHADER = `
+    uniform sampler2D texture;
+    uniform bool picking;
+    varying vec2 vUv;
+    varying vec3 vColor;
+
+    void main() {
+      if (picking) {
+        gl_FragColor = vec4(vColor, 1.0);
+      } else {
+        vec4 fromTexture = texture2D(texture, vUv);
+        vec4 color = vec4(vColor, 1.0);
+        gl_FragColor = color + fromTexture;
+      }
+    }`;
+
+type GlyphTexture = {
+  texture: THREE.Texture; lengths: Float32Array; offsets: Float32Array;
+};
+
+/**
+ * Renders the text labels as 3d geometry in the world.
+ */
+export class ScatterPlotWebGLVisualizer3DLabels implements
+    ScatterPlotWebGLVisualizer {
+  private dataSet: DataSet;
+  private scene: THREE.Scene;
+  private labelAccessor: (index: number) => string;
+  private geometry: THREE.BufferGeometry;
+  private material: THREE.ShaderMaterial;
+  private uniforms: Object;
+  private labelsMesh: THREE.Mesh;
+  private positions: THREE.BufferAttribute;
+  private totalVertexCount: number;
+  private labelVertexMap: number[][];
+  private glyphTexture: GlyphTexture;
+
+  constructor(scatterPlotWebGL: ScatterPlotWebGL) {
+    scatterPlotWebGL.onSelection((s: number[]) => this.onSelectionChanged(s));
+    this.createGlyphTexture();
+
+    this.uniforms = {
+      texture: {type: 't', value: this.glyphTexture.texture},
+      picking: {type: 'bool', value: false},
+      camPos: {type: 'float', value: new THREE.Vector3()}
+    };
+
+    this.material = new THREE.ShaderMaterial({
+      uniforms: this.uniforms,
+      transparent: true,
+      side: THREE.DoubleSide,
+      vertexShader: VERTEX_SHADER,
+      fragmentShader: FRAGMENT_SHADER,
+    });
+  }
+
+  private createGlyphTexture() {
+    if (this.glyphTexture) {
+      this.glyphTexture.texture.dispose();
+    }
+
+    let canvas = document.createElement('canvas');
+    canvas.width = MAX_CANVAS_DIMENSION;
+    canvas.height = FONT_SIZE;
+    let ctx = canvas.getContext('2d');
+    ctx.font = 'bold ' + FONT_SIZE * 0.75 + 'px roboto';
+    ctx.textBaseline = 'top';
+    ctx.fillStyle = LABEL_BACKGROUND;
+    ctx.rect(0, 0, canvas.width, canvas.height);
+    ctx.fill();
+    ctx.fillStyle = LABEL_COLOR;
+    let spaceOffset = ctx.measureText(' ').width;
+    // For each letter, store length, position at the encoded index.
+    let glyphLengths = new Float32Array(NUM_GLYPHS);
+    let glyphOffset = new Float32Array(NUM_GLYPHS);
+    let leftCoord = 0;
+    for (let i = 0; i < NUM_GLYPHS; i++) {
+      let text = ' ' + String.fromCharCode(i);
+      let textLength = ctx.measureText(text).width;
+      glyphLengths[i] = textLength - spaceOffset;
+      glyphOffset[i] = leftCoord;
+      ctx.fillText(text, leftCoord - spaceOffset, 0);
+      leftCoord += textLength;
+    }
+    let tex = createTexture(canvas);
+    this.glyphTexture = {
+      texture: tex,
+      lengths: glyphLengths,
+      offsets: glyphOffset
+    };
+  }
+
+  private processLabelVerts() {
+    let numTotalLetters = 0;
+    this.labelVertexMap = [];
+    for (let i = 0; i < this.dataSet.points.length; i++) {
+      let label = this.labelAccessor(i);
+      let vertsArray: number[] = [];
+      for (let j = 0; j < label.length; j++) {
+        for (let k = 0; k < VERTICES_PER_GLYPH; k++) {
+          vertsArray.push(numTotalLetters * VERTICES_PER_GLYPH + k);
+        }
+        numTotalLetters++;
+      }
+      this.labelVertexMap.push(vertsArray);
+    }
+    this.totalVertexCount = numTotalLetters * VERTICES_PER_GLYPH;
+  }
+
+  private createLabelGeometry() {
+    let posArray =
+        new Float32Array(this.totalVertexCount * XYZ_ELEMENTS_PER_ENTRY);
+    let uvArray =
+        new Float32Array(this.totalVertexCount * UV_ELEMENTS_PER_ENTRY);
+    let colorsArray =
+        new Float32Array(this.totalVertexCount * RGB_ELEMENTS_PER_ENTRY);
+    let positionObject = new THREE.BufferAttribute(posArray, 2);
+    let uv = new THREE.BufferAttribute(uvArray, UV_ELEMENTS_PER_ENTRY);
+    let colors = new THREE.BufferAttribute(colorsArray, RGB_ELEMENTS_PER_ENTRY);
+
+    this.geometry = new THREE.BufferGeometry();
+    this.geometry.addAttribute('posObj', positionObject);
+    this.geometry.addAttribute('position', this.positions);
+    this.geometry.addAttribute('uv', uv);
+    this.geometry.addAttribute('color', colors);
+
+    let lettersSoFar = 0;
+    for (let i = 0; i < this.dataSet.points.length; i++) {
+      let label = this.labelAccessor(i);
+      let leftOffset = 0;
+      // Determine length of word in pixels.
+      for (let j = 0; j < label.length; j++) {
+        let letterCode = label.charCodeAt(j);
+        leftOffset += this.glyphTexture.lengths[letterCode];
+      }
+      leftOffset /= -2;  // centers text horizontally around the origin
+      for (let j = 0; j < label.length; j++) {
+        let letterCode = label.charCodeAt(j);
+        let letterWidth = this.glyphTexture.lengths[letterCode];
+        let scale = FONT_SIZE;
+        let right = (leftOffset + letterWidth) / scale;
+        let left = (leftOffset) / scale;
+        let top = FONT_SIZE / scale;
+
+        // First triangle
+        positionObject.setXY(lettersSoFar * VERTICES_PER_GLYPH + 0, left, 0);
+        positionObject.setXY(lettersSoFar * VERTICES_PER_GLYPH + 1, left, top);
+        positionObject.setXY(lettersSoFar * VERTICES_PER_GLYPH + 2, right, 0);
+
+        // Second triangle
+        positionObject.setXY(lettersSoFar * VERTICES_PER_GLYPH + 3, left, top);
+        positionObject.setXY(lettersSoFar * VERTICES_PER_GLYPH + 4, right, 0);
+        positionObject.setXY(lettersSoFar * VERTICES_PER_GLYPH + 5, right, top);
+
+        // Set UVs based on letter.
+        let uLeft = (this.glyphTexture.offsets[letterCode]);
+        let uRight = (this.glyphTexture.offsets[letterCode] + letterWidth);
+        // Scale so that uvs lie between 0 and 1 on the texture.
+        uLeft /= MAX_CANVAS_DIMENSION;
+        uRight /= MAX_CANVAS_DIMENSION;
+        let vTop = 1;
+        let vBottom = 0;
+        uv.setXY(lettersSoFar * VERTICES_PER_GLYPH + 0, uLeft, vTop);
+        uv.setXY(lettersSoFar * VERTICES_PER_GLYPH + 1, uLeft, vBottom);
+        uv.setXY(lettersSoFar * VERTICES_PER_GLYPH + 2, uRight, vTop);
+        uv.setXY(lettersSoFar * VERTICES_PER_GLYPH + 3, uLeft, vBottom);
+        uv.setXY(lettersSoFar * VERTICES_PER_GLYPH + 4, uRight, vTop);
+        uv.setXY(lettersSoFar * VERTICES_PER_GLYPH + 5, uRight, vBottom);
+
+        lettersSoFar++;
+        leftOffset += letterWidth;
+      }
+    }
+
+    this.labelsMesh = new THREE.Mesh(this.geometry, this.material);
+  }
+
+  private destroyLabels() {
+    if (this.labelsMesh) {
+      if (this.scene) {
+        this.scene.remove(this.labelsMesh);
+      }
+      this.geometry.dispose();
+      this.labelsMesh = null;
+    }
+  }
+
+  private createLabels() {
+    this.destroyLabels();
+    if (this.labelAccessor) {
+      this.createLabelGeometry();
+    }
+  }
+
+  onRecreateScene(
+      scene: THREE.Scene, sceneIs3D: boolean, backgroundColor: number) {
+    this.scene = scene;
+    if (this.labelsMesh == null) {
+      this.createLabels();
+    }
+    if (this.labelsMesh) {
+      scene.add(this.labelsMesh);
+    }
+  }
+
+  removeAllFromScene(scene: THREE.Scene) {
+    this.destroyLabels();
+  }
+
+  onSetLabelAccessor(labelAccessor: (index: number) => string) {
+    this.labelAccessor = labelAccessor;
+    this.onUpdate();
+  }
+
+  onDataSet(dataSet: DataSet, spriteImage: HTMLImageElement) {
+    this.dataSet = dataSet;
+  }
+
+  onPickingRender(camera: THREE.Camera, cameraTarget: THREE.Vector3) {}
+
+  onRender(renderContext: RenderContext) {
+    this.material.uniforms.texture.value = this.glyphTexture.texture;
+    this.material.uniforms.picking.value = false;
+    this.material.uniforms.camPos.value = renderContext.camera.position;
+  }
+
+  onUpdate() {
+    this.processLabelVerts();
+    let positionArray =
+        new Float32Array(this.totalVertexCount * XYZ_ELEMENTS_PER_ENTRY);
+    this.positions =
+        new THREE.BufferAttribute(positionArray, XYZ_ELEMENTS_PER_ENTRY);
+
+    this.createLabels();
+    if (this.labelsMesh && this.scene) {
+      this.scene.add(this.labelsMesh);
+    }
+    for (let i = 0; i < this.dataSet.points.length; i++) {
+      let pp = this.dataSet.points[i].projectedPoint;
+      this.labelVertexMap[i].forEach((j) => {
+        this.positions.setXYZ(j, pp[0], pp[1], pp[2]);
+      });
+    };
+  }
+
+  onResize(newWidth: number, newHeight: number) {}
+  onSelectionChanged(selection: number[]) {}
+}
diff --git a/tensorflow/tensorboard/components/vz-projector/scatterPlotWebGLVisualizerAxes.ts b/tensorflow/tensorboard/components/vz-projector/scatterPlotWebGLVisualizerAxes.ts
index bc63b67910a..88154ff496b 100644
--- a/tensorflow/tensorboard/components/vz-projector/scatterPlotWebGLVisualizerAxes.ts
+++ b/tensorflow/tensorboard/components/vz-projector/scatterPlotWebGLVisualizerAxes.ts
@@ -114,5 +114,5 @@ export class ScatterPlotWebGLVisualizerAxes implements
   onRender(renderContext: RenderContext) {}
   onUpdate() {}
   onResize(newWidth: number, newHeight: number) {}
-  onSetDayNightMode(isNight: boolean) {}
+  onSetLabelAccessor(labelAccessor: (index: number) => string) {}
 }
diff --git a/tensorflow/tensorboard/components/vz-projector/scatterPlotWebGLVisualizerCanvasLabels.ts b/tensorflow/tensorboard/components/vz-projector/scatterPlotWebGLVisualizerCanvasLabels.ts
index caf52df93a4..9cb3858d920 100644
--- a/tensorflow/tensorboard/components/vz-projector/scatterPlotWebGLVisualizerCanvasLabels.ts
+++ b/tensorflow/tensorboard/components/vz-projector/scatterPlotWebGLVisualizerCanvasLabels.ts
@@ -20,10 +20,8 @@ import {ScatterPlotWebGLVisualizer} from './scatterPlotWebGLVisualizer';
 import {getProjectedPointFromIndex, vector3DToScreenCoords} from './util';
 import {Point2D} from './vector';
 
-const LABEL_COLOR_DAY = 0x000000;
-const LABEL_COLOR_NIGHT = 0xffffff;
-const LABEL_STROKE_DAY = 0xffffff;
-const LABEL_STROKE_NIGHT = 0x000000;
+const LABEL_COLOR = 0x000000;
+const LABEL_STROKE = 0xffffff;
 
 // The maximum number of labels to draw to keep the frame rate up.
 const SAMPLE_SIZE = 10000;
@@ -40,8 +38,8 @@ export class ScatterPlotWebGLVisualizerCanvasLabels implements
   private gc: CanvasRenderingContext2D;
   private canvas: HTMLCanvasElement;
   private labelCanvasIsCleared = true;
-  private labelColor: number = LABEL_COLOR_DAY;
-  private labelStroke: number = LABEL_STROKE_DAY;
+  private labelColor: number = LABEL_COLOR;
+  private labelStroke: number = LABEL_STROKE;
   private labelsActive: boolean = true;
   private sceneIs3D: boolean = true;
 
@@ -215,13 +213,6 @@ export class ScatterPlotWebGLVisualizerCanvasLabels implements
     this.removeAllLabels();
   }
 
-  onSetDayNightMode(isNight: boolean) {
-    this.labelColor = (isNight ? LABEL_COLOR_NIGHT : LABEL_COLOR_DAY);
-    this.labelStroke = (isNight ? LABEL_STROKE_NIGHT : LABEL_STROKE_DAY);
-  }
-
-  onPickingRender(camera: THREE.Camera, cameraTarget: THREE.Vector3) {}
-
   onRender(rc: RenderContext) {
     if (this.labelsActive) {
       this.makeLabels(
@@ -230,4 +221,7 @@ export class ScatterPlotWebGLVisualizerCanvasLabels implements
           rc.nearestCameraSpacePointZ, rc.farthestCameraSpacePointZ);
     }
   }
+
+  onPickingRender(camera: THREE.Camera, cameraTarget: THREE.Vector3) {}
+  onSetLabelAccessor(labelAccessor: (index: number) => string) {}
 }
diff --git a/tensorflow/tensorboard/components/vz-projector/scatterPlotWebGLVisualizerSprites.ts b/tensorflow/tensorboard/components/vz-projector/scatterPlotWebGLVisualizerSprites.ts
index 7921dbd885b..2b5da46e4d7 100644
--- a/tensorflow/tensorboard/components/vz-projector/scatterPlotWebGLVisualizerSprites.ts
+++ b/tensorflow/tensorboard/components/vz-projector/scatterPlotWebGLVisualizerSprites.ts
@@ -17,6 +17,7 @@ import {RenderContext} from './renderContext';
 import {DataSet} from './scatterPlot';
 import {ScatterPlotWebGL} from './scatterPlotWebGL';
 import {ScatterPlotWebGLVisualizer} from './scatterPlotWebGLVisualizer';
+import {createTexture} from './util';
 
 const NUM_POINTS_FOG_THRESHOLD = 5000;
 const MIN_POINT_SIZE = 5.0;
@@ -25,9 +26,6 @@ const IMAGE_SIZE = 30;
 const POINT_COLOR = 0x7575D9;
 const POINT_COLOR_GRAYED = 0x888888;
 
-const BLENDING_DAY = THREE.MultiplyBlending;
-const BLENDING_NIGHT = THREE.AdditiveBlending;
-
 // Constants relating to the indices of buffer arrays.
 /** Item size of a single point in a bufferArray representing colors */
 const RGB_NUM_BYTES = 3;
@@ -141,8 +139,6 @@ export class ScatterPlotWebGLVisualizerSprites implements
   private pickingColors: Float32Array;
   private renderColors: Float32Array;
 
-  private blending: THREE.Blending = BLENDING_DAY;
-
   constructor(scatterPlotWebGL: ScatterPlotWebGL) {
     scatterPlotWebGL.onSelection((s: number[]) => this.onSelectionChanged(s));
   }
@@ -159,7 +155,7 @@ export class ScatterPlotWebGLVisualizerSprites implements
     let image = this.image || canvas;
     // TODO(b/31390553): Pass sprite dim to the renderer.
     let spriteDim = 28.0;
-    let tex = this.createTexture(image);
+    let tex = createTexture(image);
     let pointSize = (this.sceneIs3D ? this.pointSize3D : this.pointSize2D);
     if (this.image) {
       pointSize = IMAGE_SIZE;
@@ -187,7 +183,7 @@ export class ScatterPlotWebGLVisualizerSprites implements
       depthTest: haveImage,
       depthWrite: haveImage,
       fog: true,
-      blending: (this.image ? THREE.NormalBlending : this.blending),
+      blending: (this.image ? THREE.NormalBlending : THREE.MultiplyBlending),
     });
 
     this.pickingMaterial = new THREE.ShaderMaterial({
@@ -198,7 +194,7 @@ export class ScatterPlotWebGLVisualizerSprites implements
       depthTest: true,
       depthWrite: true,
       fog: false,
-      blending: (this.image ? THREE.NormalBlending : this.blending),
+      blending: (this.image ? THREE.NormalBlending : THREE.MultiplyBlending),
     });
 
     // And finally initialize it and add it to the scene.
@@ -338,19 +334,6 @@ export class ScatterPlotWebGLVisualizerSprites implements
     scene.remove(this.points);
   }
 
-  /**
-   * Generate a texture for the points/images and sets some initial params
-   */
-  createTexture(image: HTMLImageElement|HTMLCanvasElement): THREE.Texture {
-    let tex = new THREE.Texture(image);
-    tex.needsUpdate = true;
-    // Used if the texture isn't a power of 2.
-    tex.minFilter = THREE.LinearFilter;
-    tex.generateMipmaps = false;
-    tex.flipY = false;
-    return tex;
-  }
-
   onDataSet(dataSet: DataSet, spriteImage: HTMLImageElement) {
     this.dataSet = dataSet;
     this.image = spriteImage;
@@ -371,10 +354,6 @@ export class ScatterPlotWebGLVisualizerSprites implements
         (selection.length > 0) ? POINT_COLOR_GRAYED : POINT_COLOR;
   }
 
-  onSetDayNightMode(isNight: boolean) {
-    this.blending = (isNight ? BLENDING_NIGHT : BLENDING_DAY);
-  }
-
   onRecreateScene(
       scene: THREE.Scene, sceneIs3D: boolean, backgroundColor: number) {
     this.sceneIs3D = sceneIs3D;
@@ -390,6 +369,7 @@ export class ScatterPlotWebGLVisualizerSprites implements
   }
 
   onResize(newWidth: number, newHeight: number) {}
+  onSetLabelAccessor(labelAccessor: (index: number) => string) {}
 
   onPickingRender(camera: THREE.Camera, cameraTarget: THREE.Vector3) {
     if (!this.geometry) {
diff --git a/tensorflow/tensorboard/components/vz-projector/scatterPlotWebGLVisualizerTraces.ts b/tensorflow/tensorboard/components/vz-projector/scatterPlotWebGLVisualizerTraces.ts
index cc8b0e49dae..2201c643f55 100644
--- a/tensorflow/tensorboard/components/vz-projector/scatterPlotWebGLVisualizerTraces.ts
+++ b/tensorflow/tensorboard/components/vz-projector/scatterPlotWebGLVisualizerTraces.ts
@@ -186,5 +186,5 @@ export class ScatterPlotWebGLVisualizerTraces implements
   onPickingRender(camera: THREE.Camera, cameraTarget: THREE.Vector3) {}
   onRender(renderContext: RenderContext) {}
   onResize(newWidth: number, newHeight: number) {}
-  onSetDayNightMode(isNight: boolean) {}
+  onSetLabelAccessor(labelAccessor: (index: number) => string) {}
 }
diff --git a/tensorflow/tensorboard/components/vz-projector/util.ts b/tensorflow/tensorboard/components/vz-projector/util.ts
index 0fca320dd2b..453d501c7a0 100644
--- a/tensorflow/tensorboard/components/vz-projector/util.ts
+++ b/tensorflow/tensorboard/components/vz-projector/util.ts
@@ -79,6 +79,20 @@ export function getNearFarPoints(
   return [shortestDist, furthestDist];
 }
 
+/**
+ * Generate a texture for the points/images and sets some initial params
+ */
+export function createTexture(image: HTMLImageElement|
+                              HTMLCanvasElement): THREE.Texture {
+  let tex = new THREE.Texture(image);
+  tex.needsUpdate = true;
+  // Used if the texture isn't a power of 2.
+  tex.minFilter = THREE.LinearFilter;
+  tex.generateMipmaps = false;
+  tex.flipY = false;
+  return tex;
+}
+
 /**
  * Assert that the condition is satisfied; if not, log user-specified message
  * to the console.
diff --git a/tensorflow/tensorboard/components/vz-projector/vz-projector.html b/tensorflow/tensorboard/components/vz-projector/vz-projector.html
index 0b2213c63f8..9d291b21316 100644
--- a/tensorflow/tensorboard/components/vz-projector/vz-projector.html
+++ b/tensorflow/tensorboard/components/vz-projector/vz-projector.html
@@ -624,7 +624,7 @@ paper-listbox .pca-item {
           <a class="euclidean" href="javascript:void(0);">euclidean</a>
         </div>
       </div>
-      <p>Nearest points to <b id="nn-title"></b></p>
+      <p>Nearest points:
       <div class="nn-list"></div>
     </div>
 
diff --git a/tensorflow/tensorboard/components/vz-projector/vz-projector.ts b/tensorflow/tensorboard/components/vz-projector/vz-projector.ts
index 47dad305afb..45bca9186e8 100644
--- a/tensorflow/tensorboard/components/vz-projector/vz-projector.ts
+++ b/tensorflow/tensorboard/components/vz-projector/vz-projector.ts
@@ -26,7 +26,6 @@ import {ColorOption, DataPanel} from './vz-projector-data-panel';
 // tslint:disable-next-line:no-unused-variable
 import {PolymerElement, PolymerHTMLElement} from './vz-projector-util';
 
-
 /** T-SNE perplexity. Roughly how many neighbors each point influences. */
 let perplexity: number = 30;
 /** T-SNE learning rate. */
@@ -37,12 +36,11 @@ let dimension = 3;
 let numNN = 100;
 
 /** Highlight stroke color for the nearest neighbors. */
-const NN_HIGHLIGHT_COLOR = '#6666FA';
+const NN_HIGHLIGHT_COLOR = '#FA6666';
 /** Color to denote a missing value. */
 const MISSING_VALUE_COLOR = 'black';
 /** Highlight stroke color for the selected point */
-const POINT_HIGHLIGHT_COLOR_DAY = 'black';
-const POINT_HIGHLIGHT_COLOR_NIGHT = new THREE.Color(0xFFE11F).getStyle();
+const POINT_HIGHLIGHT_COLOR = 'black';
 
 /** Color scale for nearest neighbors. */
 const NN_COLOR_SCALE =
@@ -99,8 +97,6 @@ export class Projector extends ProjectorPolymer {
   private dim: number;
   private selectedDistance: (a: number[], b: number[]) => number;
   private highlightedPoints: {index: number, color: string}[];
-  // The index of a point that has been individually clicked.
-  private clickedPoint: number;
   // The index of all selected points.
   private selectedPoints: number[];
   private centroidValues: any;
@@ -125,7 +121,6 @@ export class Projector extends ProjectorPolymer {
     this.hasPcaZ = true;
     this.selectedDistance = vector.cosDistNorm;
     this.highlightedPoints = [];
-    this.clickedPoint = null;
     this.selectedPoints = [];
     this.centroidValues = {xLeft: null, xRight: null, yUp: null, yDown: null};
     this.centroids = {xLeft: null, xRight: null, yUp: null, yDown: null};
@@ -362,12 +357,8 @@ export class Projector extends ProjectorPolymer {
           searchBoxInfo.style('color', null).text(`${indices.length} matches.`);
           this.showTab('inspector');
           let neighbors = this.findNeighbors(indices[0]);
-          if (indices.length === 1) {
-            this.clickedPoint = indices[0];
-            this.scatterPlot.clickOnPoint(this.clickedPoint);
-          }
           this.selectedPoints = indices;
-          this.updateNNList(neighbors);
+          this.updateInspectorPane(neighbors);
         }
         this.selectionWasUpdated();
       }
@@ -397,7 +388,7 @@ export class Projector extends ProjectorPolymer {
       self.selectedDistance = vector.dist;
       if (self.selectedPoints.length > 0) {
         let neighbors = self.findNeighbors(self.selectedPoints[0]);
-        self.updateNNList(neighbors);
+        self.updateInspectorPane(neighbors);
       }
     });
 
@@ -407,7 +398,7 @@ export class Projector extends ProjectorPolymer {
       self.selectedDistance = vector.cosDistNorm;
       if (self.selectedPoints.length > 0) {
         let neighbors = self.findNeighbors(self.selectedPoints[0]);
-        self.updateNNList(neighbors);
+        self.updateInspectorPane(neighbors);
       }
     });
 
@@ -423,9 +414,8 @@ export class Projector extends ProjectorPolymer {
     let modeIsNight = dayNightModeButton.classed('selected');
     dayNightModeButton.on('click', () => {
       modeIsNight = !modeIsNight;
-      this.scatterPlot.setDayNightMode(modeIsNight);
-      this.scatterPlot.update();
       dayNightModeButton.classed('selected', modeIsNight);
+      this.scatterPlot.setDayNightMode(modeIsNight);
     });
 
     // Resize
@@ -449,7 +439,6 @@ export class Projector extends ProjectorPolymer {
           new ScatterPlotWebGLVisualizerCanvasLabels(container));
 
       this.scatterPlot = scatterPlotWebGL;
-      this.scatterPlot.setDayNightMode(modeIsNight);
     }
 
     this.scatterPlot.onHover(hoveredIndex => {
@@ -458,11 +447,8 @@ export class Projector extends ProjectorPolymer {
       } else {
         let point = this.points[hoveredIndex];
         this.dom.select('#hoverInfo').text(point.metadata['label']);
-        this.highlightedPoints = [{
-          index: hoveredIndex,
-          color: modeIsNight ? POINT_HIGHLIGHT_COLOR_NIGHT :
-                               POINT_HIGHLIGHT_COLOR_DAY,
-        }];
+        this.highlightedPoints =
+            [{index: hoveredIndex, color: POINT_HIGHLIGHT_COLOR}];
       }
       this.selectionWasUpdated();
     });
@@ -499,29 +485,24 @@ export class Projector extends ProjectorPolymer {
     });
   }
 
-  private updateSelection(selectedPoints: number[]) {
+  private updateSelection(points: number[]) {
     // If no points are selected, unselect everything.
-    if (!selectedPoints.length) {
-      this.clickedPoint = null;
+    if (!points.length) {
       this.selectedPoints = [];
-      this.updateNNList([]);
-    } else if (selectedPoints.length === 1) {
+      this.updateInspectorPane([]);
+    } else if (points.length === 1) {
       // If only one point is selected, we want to get its nearest neighbors
       // and change the UI accordingly.
-      this.clickedPoint = selectedPoints[0];
       this.showTab('inspector');
-      let neighbors = this.findNeighbors(this.clickedPoint);
-      this.selectedPoints =
-          [this.clickedPoint].concat(neighbors.map(n => n.index));
-      this.updateNNList(neighbors);
+      let neighbors = this.findNeighbors(points[0]);
+      this.selectedPoints = [points[0]].concat(neighbors.map(n => n.index));
+      this.updateInspectorPane(neighbors);
     } else {
-      this.clickedPoint = null;
       // Otherwise, select all points and hide nearest neighbors list.
-      this.selectedPoints = selectedPoints as number[];
+      this.selectedPoints = points;
       this.highlightedPoints = [];
-      this.updateNNList([]);
+      this.updateInspectorPane([]);
     }
-    this.updateMetadata();
     this.selectionWasUpdated();
   }
 
@@ -615,38 +596,36 @@ export class Projector extends ProjectorPolymer {
     let metadataContainerElement = this.dom.select('.ink-panel-metadata');
     metadataContainerElement.selectAll('*').remove();
 
-    let display = false;
-    if (this.clickedPoint != null) {
-      let selectedPoint = this.points[this.clickedPoint];
+    let point = this.points[this.selectedPoints[0]];
+    this.dom.select('.ink-panel-metadata-container')
+        .style('display', point != null ? '' : 'none');
 
-      for (let metadataKey in selectedPoint.metadata) {
-        if (!selectedPoint.metadata.hasOwnProperty(metadataKey)) {
-          continue;
-        }
-        let rowElement = document.createElement('div');
-        rowElement.className = 'ink-panel-metadata-row vz-projector';
-
-        let keyElement = document.createElement('div');
-        keyElement.className = 'ink-panel-metadata-key vz-projector';
-        keyElement.textContent = metadataKey;
-
-        let valueElement = document.createElement('div');
-        valueElement.className = 'ink-panel-metadata-value vz-projector';
-        valueElement.textContent = '' + selectedPoint.metadata[metadataKey];
-
-        rowElement.appendChild(keyElement);
-        rowElement.appendChild(valueElement);
-
-        metadataContainerElement.append(function() {
-          return this.appendChild(rowElement);
-        });
-      }
-
-      display = true;
+    if (point == null) {
+      return;
     }
 
-    this.dom.select('.ink-panel-metadata-container')
-        .style('display', display ? '' : 'none');
+    for (let metadataKey in point.metadata) {
+      if (!point.metadata.hasOwnProperty(metadataKey)) {
+        continue;
+      }
+      let rowElement = document.createElement('div');
+      rowElement.className = 'ink-panel-metadata-row vz-projector';
+
+      let keyElement = document.createElement('div');
+      keyElement.className = 'ink-panel-metadata-key vz-projector';
+      keyElement.textContent = metadataKey;
+
+      let valueElement = document.createElement('div');
+      valueElement.className = 'ink-panel-metadata-value vz-projector';
+      valueElement.textContent = '' + point.metadata[metadataKey];
+
+      rowElement.appendChild(keyElement);
+      rowElement.appendChild(valueElement);
+
+      metadataContainerElement.append(function() {
+        return this.appendChild(rowElement);
+      });
+    }
   }
 
   private selectionWasUpdated() {
@@ -692,19 +671,15 @@ export class Projector extends ProjectorPolymer {
   }
 
   /** Updates the nearest neighbors list in the inspector. */
-  private updateNNList(neighbors: knn.NearestEntry[]) {
+  private updateInspectorPane(neighbors: knn.NearestEntry[]) {
+    this.updateMetadata();
     let nnlist = this.dom.select('.nn-list');
     nnlist.html('');
 
     if (neighbors.length === 0) {
-      this.dom.select('#nn-title').text('');
       return;
     }
 
-    let selectedPoint = this.points[this.clickedPoint];
-    this.dom.select('#nn-title')
-        .text(selectedPoint != null ? selectedPoint.metadata['label'] : '');
-
     let minDist = neighbors.length > 0 ? neighbors[0].dist : 0;
     let n = nnlist.selectAll('.neighbor')
                 .data(neighbors)
diff --git a/tensorflow/tensorboard/plugins/__init__.py b/tensorflow/tensorboard/plugins/__init__.py
new file mode 100644
index 00000000000..defcd304fa4
--- /dev/null
+++ b/tensorflow/tensorboard/plugins/__init__.py
@@ -0,0 +1,22 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Holds the list of registered plugins to TensorBoard."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.tensorboard.plugins.projector.plugin import ProjectorPlugin
+# Map of registered plugins in TensorBoard.
+REGISTERED_PLUGINS = {'projector': ProjectorPlugin()}
diff --git a/tensorflow/tensorboard/plugins/base_plugin.py b/tensorflow/tensorboard/plugins/base_plugin.py
new file mode 100644
index 00000000000..3ee392d1b6b
--- /dev/null
+++ b/tensorflow/tensorboard/plugins/base_plugin.py
@@ -0,0 +1,55 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TensorBoard Plugin abstract base class.
+
+Every plugin in TensorBoard must extend and implement the abstract methods of
+this base class.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from abc import ABCMeta
+from abc import abstractmethod
+
+
+class TBPlugin(object):
+  """TensorBoard plugin interface. Every plugin must extend from this class."""
+  __metaclass__ = ABCMeta
+
+  def initialize(self, handler):
+    """Initializes the plugin.
+
+    Args:
+      handler: The tensorboard http handler that has methods that are used
+               by plugins such as serving json or gzip response.
+    """
+    self.handler = handler
+
+  @abstractmethod
+  def get_plugin_handlers(self, run_paths):
+    """Returns a set of http handlers that the plugin implements.
+
+    Each handler gets registered with the tensorboard handler and is served
+    under a prefix path that includes the name of the plugin.
+
+    Args:
+      run_paths: A dict mapping a run name to an event file path.
+
+    Returns:
+      A dict mapping route paths to http handler methods.
+    """
+    raise NotImplementedError()
diff --git a/tensorflow/tensorboard/plugins/projector/plugin.py b/tensorflow/tensorboard/plugins/projector/plugin.py
new file mode 100644
index 00000000000..4dd153d9beb
--- /dev/null
+++ b/tensorflow/tensorboard/plugins/projector/plugin.py
@@ -0,0 +1,194 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The Embedding Projector plugin."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import six
+from google.protobuf import text_format
+from tensorflow.contrib.tensorboard.plugins.projector import PROJECTOR_FILENAME
+from tensorflow.contrib.tensorboard.plugins.projector.projector_config_pb2 import ProjectorConfig
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.pywrap_tensorflow import NewCheckpointReader
+from tensorflow.python.training.saver import latest_checkpoint
+from tensorflow.tensorboard.plugins.base_plugin import TBPlugin
+
+# HTTP routes.
+INFO_ROUTE = '/info'
+TENSOR_ROUTE = '/tensor'
+METADATA_ROUTE = '/metadata'
+RUNS_ROUTE = '/runs'
+
+# Limit for the number of points we send to the browser.
+LIMIT_NUM_POINTS = 50000
+
+
+class ProjectorPlugin(TBPlugin):
+  """Embedding projector."""
+
+  def get_plugin_handlers(self, run_paths):
+    self.configs, self.config_fpaths = self._read_config_files(run_paths)
+    self.readers = {}
+
+    return {
+        RUNS_ROUTE: self._serve_runs,
+        INFO_ROUTE: self._serve_info,
+        TENSOR_ROUTE: self._serve_tensor,
+        METADATA_ROUTE: self._serve_metadata,
+    }
+
+  def _read_config_files(self, run_paths):
+    configs = {}
+    config_fpaths = {}
+    for run_name, logdir in six.iteritems(run_paths):
+      config_fpath = os.path.join(logdir, PROJECTOR_FILENAME)
+      if not file_io.file_exists(config_fpath):
+        # Skip runs that have no config file.
+        continue
+      # Read the config file.
+      file_content = file_io.read_file_to_string(config_fpath).decode('utf-8')
+      config = ProjectorConfig()
+      text_format.Merge(file_content, config)
+
+      if not config.model_checkpoint_path:
+        # See if you can find a checkpoint file in the logdir.
+        ckpt_path = latest_checkpoint(logdir)
+        if not ckpt_path:
+          # Or in the parent of logdir.
+          ckpt_path = latest_checkpoint(os.path.join('../', logdir))
+          if not ckpt_path:
+            logging.warning('Cannot find model checkpoint in %s', logdir)
+            continue
+        config.model_checkpoint_path = ckpt_path
+
+      # Sanity check for the checkpoint file.
+      if not file_io.file_exists(config.model_checkpoint_path):
+        logging.warning('Checkpoint file %s not found',
+                        config.model_checkpoint_path)
+        continue
+      configs[run_name] = config
+      config_fpaths[run_name] = config_fpath
+    return configs, config_fpaths
+
+  def _get_reader_for_run(self, run):
+    if run in self.readers:
+      return self.readers[run]
+
+    config = self.configs[run]
+    reader = NewCheckpointReader(config.model_checkpoint_path)
+    self.readers[run] = reader
+    return reader
+
+  def _get_metadata_file_for_tensor(self, tensor_name, config):
+    if not config.embedding:
+      return None
+    for info in config.embedding:
+      if info.tensor_name == tensor_name:
+        return info.metadata_path
+    return None
+
+  def _serve_runs(self, query_params):
+    """Returns a list of runs that have embeddings."""
+    self.handler.send_json_response(list(self.configs.keys()))
+
+  def _serve_info(self, query_params):
+    run = query_params.get('run')
+    if run is None:
+      self.handler.send_error(400, 'query parameter "run" is required')
+      return
+    if run not in self.configs:
+      self.handler.send_error(400, 'Unknown run: %s' % run)
+      return
+
+    config = self.configs[run]
+    reader = self._get_reader_for_run(run)
+    var_map = reader.get_variable_to_shape_map()
+    embedding_map = {name: {
+        'name': name,
+        'shape': shape,
+        'metadataFile': self._get_metadata_file_for_tensor(name, config)
+    }
+                     for name, shape in six.iteritems(var_map)
+                     if len(shape) == 2}
+    self.handler.send_json_response({
+        'tensors': embedding_map,
+        'checkpointFile': config.model_checkpoint_path,
+    })
+
+  def _serve_metadata(self, query_params):
+    run = query_params.get('run')
+    if run is None:
+      self.handler.send_error(400, 'query parameter "run" is required')
+      return
+
+    name = query_params.get('name')
+    if name is None:
+      self.handler.send_error(400, 'query parameter "name" is required')
+      return
+    if run not in self.configs:
+      self.handler.send_error(400, 'Unknown run: %s' % run)
+      return
+
+    config = self.configs[run]
+    fpath = self._get_metadata_file_for_tensor(name, config)
+    if not fpath:
+      self.handler.send_error(
+          400, 'No metadata file found for tensor %s in the config file %s' %
+          (name, self.config_fpaths[run]))
+      return
+    if not file_io.file_exists(fpath) or file_io.is_directory(fpath):
+      self.handler.send_error(400, '%s is not a file' % fpath)
+      return
+
+    with file_io.FileIO(fpath, 'r') as f:
+      lines = []
+      for line in f:
+        lines.append(line)
+        if len(lines) >= LIMIT_NUM_POINTS:
+          break
+    self.handler.send_gzip_response(''.join(lines), 'text/plain')
+
+  def _serve_tensor(self, query_params):
+    run = query_params.get('run')
+    if run is None:
+      self.handler.send_error(400, 'query parameter "run" is required')
+      return
+
+    name = query_params.get('name')
+    if name is None:
+      self.handler.send_error(400, 'query parameter "name" is required')
+      return
+
+    if run not in self.configs:
+      self.handler.send_error(400, 'Unknown run: %s' % run)
+      return
+
+    reader = self._get_reader_for_run(run)
+    config = self.configs[run]
+    if not reader.has_tensor(name):
+      self.handler.send_error(400, 'Tensor %s not found in checkpoint dir %s' %
+                              (name, config.model_checkpoint_path))
+      return
+    tensor = reader.get_tensor(name)
+    # Sample the tensor
+    tensor = tensor[:LIMIT_NUM_POINTS]
+    # Stream it as TSV.
+    tsv = '\n'.join(['\t'.join([str(val) for val in row]) for row in tensor])
+    self.handler.send_gzip_response(tsv, 'text/plain')
diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh
index 8dffbfd2d95..f74ddc588d1 100755
--- a/tensorflow/tools/ci_build/builds/pip.sh
+++ b/tensorflow/tools/ci_build/builds/pip.sh
@@ -214,7 +214,7 @@ source "${VENV_DIR}/bin/activate" || \
 
 # Force tensorflow reinstallation. Otherwise it may not get installed from
 # last build if it had the same version number as previous build.
-PIP_FLAGS="--upgrade --force-reinstall"
+PIP_FLAGS="--upgrade --force-reinstall --no-deps"
 pip install -v ${PIP_FLAGS} ${WHL_PATH} || \
     die "pip install (forcing to reinstall tensorflow) FAILED"
 echo "Successfully installed pip package ${WHL_PATH}"
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index dfea023b563..69c38c651fc 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -22,19 +22,30 @@ set -e
 pip install wheel
 pip3 install wheel
 
-# Use pip to install numpy to the latest version, instead of 1.8.2 through
-# apt-get
-wget -q https://pypi.python.org/packages/17/f3/404bc85be67150663024d2bb5af654c7d16cf678077690dda27b91be14eb/numpy-1.8.2-cp27-cp27mu-manylinux1_x86_64.whl#md5=3ccf5c004fc99bd06dd443de80d622e6
-mv numpy-1.8.2-cp27-cp27mu-manylinux1_x86_64.whl \
-   numpy-1.8.2-cp27-none-linux_x86_64.whl
-pip install numpy-1.8.2-cp27-none-linux_x86_64.whl
-rm numpy-1.8.2-cp27-none-linux_x86_64.whl
+# Install six.
+pip install --upgrade six==1.10.0
+pip3 install --upgrade six==1.10.0
 
-wget -q https://pypi.python.org/packages/33/7d/46d8905d39f462e0f6d1f38e1d165adc2939b9f91ca800e1cba8ef0c0f24/numpy-1.8.2-cp34-cp34m-manylinux1_x86_64.whl#md5=528b2b555d2b6979f10e444cacc04fc9
-mv numpy-1.8.2-cp34-cp34m-manylinux1_x86_64.whl \
-   numpy-1.8.2-cp34-none-linux_x86_64.whl
-pip3 install numpy-1.8.2-cp34-none-linux_x86_64.whl
-rm numpy-1.8.2-cp34-none-linux_x86_64.whl
+# Install protobuf.
+pip install --upgrade protobuf==3.0.0
+pip3 install --upgrade protobuf==3.0.0
+
+# Remove obsolete version of six, which can sometimes confuse virtualenv.
+rm -rf /usr/lib/python3/dist-packages/six*
+
+# Use pip to install numpy to a modern version, instead of 1.8.2 that comes
+# with apt-get in ubuntu:14.04.
+wget -q https://pypi.python.org/packages/06/92/3c786303889e6246971ad4c48ac2b4e37a1b1c67c0dc2106dc85cb15c18e/numpy-1.11.0-cp27-cp27mu-manylinux1_x86_64.whl#md5=6ffb66ff78c28c55bfa09a2ceee487df
+mv numpy-1.11.0-cp27-cp27mu-manylinux1_x86_64.whl \
+   numpy-1.11.0-cp27-none-linux_x86_64.whl
+pip install numpy-1.11.0-cp27-none-linux_x86_64.whl
+rm numpy-1.11.0-cp27-none-linux_x86_64.whl
+
+wget -q https://pypi.python.org/packages/ea/ca/5e48a68be496e6f79c3c8d90f7c03ea09bbb154ea4511f5b3d6c825cefe5/numpy-1.11.0-cp34-cp34m-manylinux1_x86_64.whl#md5=08a002aeffa20354aa5045eadb549361
+mv numpy-1.11.0-cp34-cp34m-manylinux1_x86_64.whl \
+   numpy-1.11.0-cp34-none-linux_x86_64.whl
+pip3 install numpy-1.11.0-cp34-none-linux_x86_64.whl
+rm numpy-1.11.0-cp34-none-linux_x86_64.whl
 
 # Use pip to install scipy to get the latest version, instead of 0.13 through
 # apt-get.
diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index ba4293cb276..f1d0961e25c 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -63,6 +63,15 @@ wget -q https://bootstrap.pypa.io/get-pip.py
 python3.5 get-pip.py
 rm -f get-pip.py
 
+# Install six.
+pip3.5 install --upgrade six==1.10.0
+
+# Install protobuf.
+pip3.5 install --upgrade protobuf==3.0.0
+
+# Remove obsolete version of six, which can sometimes confuse virtualenv.
+rm -rf /usr/lib/python3/dist-packages/six*
+
 # Install numpy, scipy and scikit-learn required by the builds
 pip3.5 install --upgrade numpy==1.11.0
 
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index a8a8193c8ec..7228197eb64 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -19,27 +19,19 @@ from __future__ import print_function
 
 import fnmatch
 import os
-import platform
 import re
 import sys
 
-from setuptools import find_packages, setup, Command, Extension
+from setuptools import find_packages, setup, Command
 from setuptools.command.install import install as InstallCommandBase
 from setuptools.dist import Distribution
 
 _VERSION = '0.10.0'
 
-numpy_version = "1.8.2"
-if platform.system() == "Darwin":
-  # There are bugs with numpy pip installation on OS X prior to
-  # 1.10.1, so on mac we require a higher version than on other
-  # platforms.
-  numpy_version = "1.10.1"
-
 REQUIRED_PACKAGES = [
-    'numpy >= %s' % numpy_version,
+    'numpy >= 1.11.0',
     'six >= 1.10.0',
-    'protobuf == 3.0.0b2',
+    'protobuf == 3.0.0',
 ]
 
 # python3 requires wheel 0.26
diff --git a/tensorflow/tools/test/performance.bzl b/tensorflow/tools/test/performance.bzl
index 750d20fdca3..83dace61e48 100644
--- a/tensorflow/tools/test/performance.bzl
+++ b/tensorflow/tools/test/performance.bzl
@@ -28,6 +28,7 @@ def tf_cc_logged_benchmark(
       tags = all_tags,
       srcs = ["//tensorflow/tools/test:run_and_gather_logs.py"],
       args = [
+          "--name=//%s:%s" % (PACKAGE_NAME, name),
           "--test_name=" + target
       ],
       data = [
diff --git a/tensorflow/tools/test/run_and_gather_logs.py b/tensorflow/tools/test/run_and_gather_logs.py
index 42cdf683156..a72dac0abbf 100644
--- a/tensorflow/tools/test/run_and_gather_logs.py
+++ b/tensorflow/tools/test/run_and_gather_logs.py
@@ -47,6 +47,7 @@ from tensorflow.tools.test import run_and_gather_logs_lib
 
 FLAGS = tf.app.flags.FLAGS
 
+tf.app.flags.DEFINE_string("name", "", """Benchmark target identifier.""")
 tf.app.flags.DEFINE_string("test_name", "", """Test target to run.""")
 tf.app.flags.DEFINE_string(
     "test_args", "", """Test arguments, space separated.""")
@@ -73,10 +74,11 @@ def gather_build_configuration():
 
 
 def main(unused_args):
+  name = FLAGS.name
   test_name = FLAGS.test_name
   test_args = FLAGS.test_args
   test_results, _ = run_and_gather_logs_lib.run_and_gather_logs(
-      test_name, test_args)
+      name, test_name, test_args)
 
   # Additional bits we receive from bazel
   test_results.build_configuration.CopyFrom(gather_build_configuration())
diff --git a/tensorflow/tools/test/run_and_gather_logs_lib.py b/tensorflow/tools/test/run_and_gather_logs_lib.py
index 03697ba2054..f787eea1ef8 100644
--- a/tensorflow/tools/test/run_and_gather_logs_lib.py
+++ b/tensorflow/tools/test/run_and_gather_logs_lib.py
@@ -44,10 +44,12 @@ def get_git_commit_sha():
   return os.getenv("GIT_COMMIT")
 
 
-def process_test_logs(test_name, test_args, start_time, run_time, log_files):
+def process_test_logs(
+    name, test_name, test_args, start_time, run_time, log_files):
   """Gather test information and put it in a TestResults proto.
 
   Args:
+    name: Benchmark target identifier.
     test_name:  A unique bazel target, e.g. "//path/to:test"
     test_args:  A string containing all arguments to run the target with.
 
@@ -60,6 +62,7 @@ def process_test_logs(test_name, test_args, start_time, run_time, log_files):
   """
 
   results = test_log_pb2.TestResults()
+  results.name = name
   results.target = test_name
   results.start_time = start_time
   results.run_time = run_time
@@ -85,10 +88,11 @@ def process_benchmarks(log_files):
   return benchmarks
 
 
-def run_and_gather_logs(test_name, test_args):
+def run_and_gather_logs(name, test_name, test_args):
   """Run the bazel test given by test_name.  Gather and return the logs.
 
   Args:
+    name: Benchmark target identifier.
     test_name: A unique bazel target, e.g. "//path/to:test"
     test_args: A string containing all arguments to run the target with.
 
@@ -138,7 +142,8 @@ def run_and_gather_logs(test_name, test_args):
     run_time = time.time() - start_time
     log_files = tf.gfile.Glob("{}*".format(test_file_prefix))
 
-    return (process_test_logs(test_name, test_args, start_time=int(start_time),
+    return (process_test_logs(name, test_name, test_args,
+                              start_time=int(start_time),
                               run_time=run_time, log_files=log_files),
             mangled_test_name)
 
diff --git a/tensorflow/user_ops/BUILD b/tensorflow/user_ops/BUILD
index 6889ab8e56a..89c8f02d11c 100644
--- a/tensorflow/user_ops/BUILD
+++ b/tensorflow/user_ops/BUILD
@@ -5,14 +5,6 @@ package(
     default_visibility = ["//tensorflow:internal"],
 )
 
-package_group(
-    name = "friends",
-    packages = [
-        "//learning/serving/...",
-        "//platforms/techila/...",
-    ],
-)
-
 licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index ac92050fe06..4850741f2cf 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -10,35 +10,40 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     print("path_prefix was specified to tf_workspace but is no longer used and will be removed in the future.")
   if tf_repo_name:
     print("tf_repo_name was specified to tf_workspace but is no longer used and will be removed in the future.")
+
   # These lines need to be changed when updating Eigen. They are parsed from
-  # this file by the cmake and make builds to determine the eigen version and hash.
-  eigen_version = "6bcd74d2fa40"
-  eigen_sha256 = "df3ca8a395fb615003762b8748c03e3aa7a8932b5674dbb5a6bd3343cc3f408d"
+  # this file by the cmake and make builds to determine the eigen version and
+  # hash.
+  eigen_version = "46ee714e25d5"
+  eigen_sha256 = "d2ba02303c20d6ddc1a922f7e0e176ef841514545e053388845359aa62176912"
 
   native.new_http_archive(
     name = "eigen_archive",
-    url = "https://bitbucket.org/eigen/eigen/get/" + eigen_version + ".tar.gz",
+    url = "http://bitbucket.org/eigen/eigen/get/" + eigen_version + ".tar.gz",
     sha256 = eigen_sha256,
     strip_prefix = "eigen-eigen-" + eigen_version,
     build_file = str(Label("//:eigen.BUILD")),
   )
 
-  native.git_repository(
+  native.http_archive(
     name = "com_googlesource_code_re2",
-    remote = "https://github.com/google/re2.git",
-    commit = "7bab3dc83df6a838cc004cc7a7f51d5fe1a427d5",
+    url = "http://github.com/google/re2/archive/7bab3dc83df6a838cc004cc7a7f51d5fe1a427d5.tar.gz",
+    sha256 = "ef91af8850f734c8be65f2774747f4c2d8d81e556ba009faa79b4dd8b2759555",
+    strip_prefix = "re2-7bab3dc83df6a838cc004cc7a7f51d5fe1a427d5",
   )
 
-  native.git_repository(
+  native.http_archive(
     name = "gemmlowp",
-    remote = "https://github.com/google/gemmlowp.git",
-    commit = "8b20dd2ce142115857220bd6a35e8a081b3e0829",
+    url = "http://github.com/google/gemmlowp/archive/8b20dd2ce142115857220bd6a35e8a081b3e0829.tar.gz",
+    sha256 = "9cf5f1e3d64b3632dbae5c65efb79f4374ca9ac362d788fc61e086af937ff6d7",
+    strip_prefix = "gemmlowp-8b20dd2ce142115857220bd6a35e8a081b3e0829",
   )
 
   native.new_http_archive(
     name = "farmhash_archive",
-    url = "https://github.com/google/farmhash/archive/34c13ddfab0e35422f4c3979f360635a8c050260.zip",
+    url = "http://github.com/google/farmhash/archive/34c13ddfab0e35422f4c3979f360635a8c050260.zip",
     sha256 = "e3d37a59101f38fd58fb799ed404d630f0eee18bfc2a2433910977cc8fea9c28",
+    strip_prefix = "farmhash-34c13ddfab0e35422f4c3979f360635a8c050260/src",
     build_file = str(Label("//:farmhash.BUILD")),
   )
 
@@ -47,24 +52,26 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     actual = "@farmhash//:farmhash",
   )
 
-  native.git_repository(
+  native.http_archive(
     name = "highwayhash",
-    remote = "https://github.com/google/highwayhash.git",
-    commit = "4bce8fc6a9ca454d9d377dbc4c4d33488bbab78f",
-    init_submodules = True,
+    url = "http://github.com/google/highwayhash/archive/4bce8fc6a9ca454d9d377dbc4c4d33488bbab78f.tar.gz",
+    sha256 = "b159a62fb05e5f6a6be20aa0df6a951ebf44a7bb96ed2e819e4e35e17f56854d",
+    strip_prefix = "highwayhash-4bce8fc6a9ca454d9d377dbc4c4d33488bbab78f",
   )
 
   native.new_http_archive(
     name = "jpeg_archive",
     url = "http://www.ijg.org/files/jpegsrc.v9a.tar.gz",
     sha256 = "3a753ea48d917945dd54a2d97de388aa06ca2eb1066cbfdc6652036349fe05a7",
+    strip_prefix = "jpeg-9a",
     build_file = str(Label("//:jpeg.BUILD")),
   )
 
   native.new_http_archive(
     name = "png_archive",
-    url = "https://github.com/glennrp/libpng/archive/v1.2.53.zip",
+    url = "http://github.com/glennrp/libpng/archive/v1.2.53.zip",
     sha256 = "c35bcc6387495ee6e757507a68ba036d38ad05b415c2553b3debe2a57647a692",
+    strip_prefix = "libpng-1.2.53",
     build_file = str(Label("//:png.BUILD")),
   )
 
@@ -72,13 +79,15 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     name = "gif_archive",
     url = "http://ufpr.dl.sourceforge.net/project/giflib/giflib-5.1.4.tar.gz",
     sha256 = "34a7377ba834397db019e8eb122e551a49c98f49df75ec3fcc92b9a794a4f6d1",
+    strip_prefix = "giflib-5.1.4/lib",
     build_file = str(Label("//:gif.BUILD")),
   )
 
   native.new_http_archive(
     name = "six_archive",
-    url = "https://pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz#md5=34eed507548117b2ab523ab14b2f8b55",
+    url = "http://pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz",
     sha256 = "105f8d68616f8248e24bf0e9372ef04d3cc10104f1980f54d57b2ce73a5ad56a",
+    strip_prefix = "six-1.10.0",
     build_file = str(Label("//:six.BUILD")),
   )
 
@@ -87,16 +96,18 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     actual = "@six_archive//:six",
   )
 
-  native.git_repository(
+  native.http_archive(
     name = "protobuf",
-    remote = "https://github.com/google/protobuf",
-    commit = "1a586735085e817b1f52e53feec92ce418049f69",  # Release 3.0.2.
+    url = "http://github.com/google/protobuf/archive/v3.0.2.tar.gz",
+    sha256 = "b700647e11556b643ccddffd1f41d8cb7704ed02090af54cc517d44d912d11c1",
+    strip_prefix = "protobuf-3.0.2",
   )
 
   native.new_http_archive(
     name = "gmock_archive",
     url = "http://pkgs.fedoraproject.org/repo/pkgs/gmock/gmock-1.7.0.zip/073b984d8798ea1594f5e44d85b20d66/gmock-1.7.0.zip",
     sha256 = "26fcbb5925b74ad5fc8c26b0495dfc96353f4d553492eb97e85a8a6d2f43095b",
+    strip_prefix = "gmock-1.7.0",
     build_file = str(Label("//:gmock.BUILD")),
   )
 
@@ -127,11 +138,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     actual = "@protobuf//:protoc_lib",
   )
 
-  native.new_git_repository(
+  native.new_http_archive(
     name = "grpc",
-    commit = "d7ff4ff40071d2b486a052183e3e9f9382afb745",
-    init_submodules = True,
-    remote = "https://github.com/grpc/grpc.git",
+    url = "http://github.com/grpc/grpc/archive/d7ff4ff40071d2b486a052183e3e9f9382afb745.tar.gz",
+    sha256 = "a15f352436ab92c521b1ac11e729e155ace38d0856380cf25048c5d1d9ba8e31",
+    strip_prefix = "grpc-d7ff4ff40071d2b486a052183e3e9f9382afb745",
     build_file = str(Label("//:grpc.BUILD")),
   )
 
@@ -148,9 +159,18 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   )
 
   native.new_git_repository(
+    name = "linenoise",
+    commit = "c894b9e59f02203dbe4e2be657572cf88c4230c3",
+    init_submodules = True,
+    remote = "https://github.com/antirez/linenoise.git",
+    build_file = str(Label("//:linenoise.BUILD")),
+  )
+
+  native.new_http_archive(
     name = "jsoncpp_git",
-    remote = "https://github.com/open-source-parsers/jsoncpp.git",
-    commit = "11086dd6a7eba04289944367ca82cea71299ed70",
+    url = "http://github.com/open-source-parsers/jsoncpp/archive/11086dd6a7eba04289944367ca82cea71299ed70.tar.gz",
+    sha256 = "07d34db40593d257324ec5fb9debc4dc33f29f8fb44e33a2eeb35503e61d0fe2",
+    strip_prefix = "jsoncpp-11086dd6a7eba04289944367ca82cea71299ed70",
     build_file = str(Label("//:jsoncpp.BUILD")),
   )
 
@@ -159,16 +179,18 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     actual = "@jsoncpp_git//:jsoncpp",
   )
 
-  native.git_repository(
+  native.http_archive(
     name = "boringssl",
-    remote = "https://github.com/google/boringssl.git",
-    commit = "bbcaa15b0647816b9a1a9b9e0d209cd6712f0105",  # 2016-07-11
+    url = "http://github.com/google/boringssl/archive/bbcaa15b0647816b9a1a9b9e0d209cd6712f0105.tar.gz",  # 2016-07-11
+    sha256 = "025264d6e9a7ad371f2f66d17a28b6627de0c9592dc2eb54afd062f68f1f9aa3",
+    strip_prefix = "boringssl-bbcaa15b0647816b9a1a9b9e0d209cd6712f0105",
   )
 
-  native.new_git_repository(
+  native.new_http_archive(
     name = "nanopb_git",
-    commit = "1251fa1",
-    remote = "https://github.com/nanopb/nanopb.git",
+    url = "http://github.com/nanopb/nanopb/archive/1251fa1065afc0d62f635e0f63fec8276e14e13c.tar.gz",
+    sha256 = "ab1455c8edff855f4f55b68480991559e51c11e7dab060bbab7cffb12dd3af33",
+    strip_prefix = "nanopb-1251fa1065afc0d62f635e0f63fec8276e14e13c",
     build_file = str(Label("//:nanopb.BUILD")),
   )
 
@@ -181,6 +203,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     name = "avro_archive",
     url = "http://www-us.apache.org/dist/avro/avro-1.8.0/cpp/avro-cpp-1.8.0.tar.gz",
     sha256 = "ec6e2ec957e95ca07f70cc25f02f5c416f47cb27bd987a6ec770dcbe72527368",
+    strip_prefix = "avro-cpp-1.8.0",
     build_file = str(Label("//:avro.BUILD")),
   )
 
@@ -188,6 +211,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     name = "boost_archive",
     url = "http://pilotfiber.dl.sourceforge.net/project/boost/boost/1.61.0/boost_1_61_0.tar.gz",
     sha256 = "a77c7cc660ec02704c6884fbb20c552d52d60a18f26573c9cee0788bf00ed7e6",
+    strip_prefix = "boost_1_61_0",
     build_file = str(Label("//:boost.BUILD")),
   )
 
@@ -195,6 +219,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     name = "bzip2_archive",
     url = "http://www.bzip.org/1.0.6/bzip2-1.0.6.tar.gz",
     sha256 = "a2848f34fcd5d6cf47def00461fcb528a0484d8edef8208d6d2e2909dc61d9cd",
+    strip_prefix = "bzip2-1.0.6",
     build_file = str(Label("//:bzip2.BUILD")),
   )
 
@@ -202,6 +227,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     name = "zlib_archive",
     url = "http://zlib.net/zlib-1.2.8.tar.gz",
     sha256 = "36658cb768a54c1d4dec43c3116c27ed893e88b02ecfcb44f2166f9c0b7f2a0d",
+    strip_prefix = "zlib-1.2.8",
     build_file = str(Label("//:zlib.BUILD")),
   )
 
diff --git a/third_party/eigen3/BUILD b/third_party/eigen3/BUILD
index 15534fa9612..9ab7aadf87a 100644
--- a/third_party/eigen3/BUILD
+++ b/third_party/eigen3/BUILD
@@ -1,8 +1,17 @@
-licenses(["restricted"])  # MPL2, portions GPL v3, LGPL v3, BSD-like
+# Description:
+#   Eigen is a C++ template library for linear algebra: vectors,
+#   matrices, and related algorithms.
+
+licenses([
+    # Note: Eigen is an MPL2 library that includes GPL v3 and LGPL v2.1+ code.
+    #       We've taken special care to not reference any restricted code.
+    "reciprocal",  # MPL2
+    "notice",  # Portions BSD
+])
 
 cc_library(
     name = "eigen3",
-    hdrs = glob([
+    hdrs = glob(["unsupported/Eigen/CXX11/src/FixedPoint/*.h"]) + [
         "Eigen/Core",
         "Eigen/LU",
         "Eigen/Cholesky",
@@ -12,10 +21,7 @@ cc_library(
         "unsupported/Eigen/SpecialFunctions",
         "unsupported/Eigen/CXX11/Tensor",
         "unsupported/Eigen/CXX11/FixedPoint",
-        "unsupported/Eigen/CXX11/src/FixedPoint/*.h",
-    ]),
-    visibility = ["//visibility:public"],
-    deps = [
-        "@eigen_archive//:eigen",
     ],
+    visibility = ["//visibility:public"],
+    deps = ["@eigen_archive//:eigen"],
 )
diff --git a/third_party/hadoop/BUILD b/third_party/hadoop/BUILD
new file mode 100644
index 00000000000..f25208c4167
--- /dev/null
+++ b/third_party/hadoop/BUILD
@@ -0,0 +1,20 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+cc_library(
+    name = "hdfs",
+    hdrs = ["hdfs.h"],
+)
diff --git a/third_party/hadoop/hdfs.h b/third_party/hadoop/hdfs.h
new file mode 100644
index 00000000000..560d8bba0e0
--- /dev/null
+++ b/third_party/hadoop/hdfs.h
@@ -0,0 +1,911 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBHDFS_HDFS_H
+#define LIBHDFS_HDFS_H
+
+#include <errno.h>  /* for EINTERNAL, etc. */
+#include <fcntl.h>  /* for O_RDONLY, O_WRONLY */
+#include <stdint.h> /* for uint64_t, etc. */
+#include <time.h>   /* for time_t */
+
+/*
+ * Support export of DLL symbols during libhdfs build, and import of DLL symbols
+ * during client application build.  A client application may optionally define
+ * symbol LIBHDFS_DLL_IMPORT in its build.  This is not strictly required, but
+ * the compiler can produce more efficient code with it.
+ */
+#ifdef WIN32
+#ifdef LIBHDFS_DLL_EXPORT
+#define LIBHDFS_EXTERNAL __declspec(dllexport)
+#elif LIBHDFS_DLL_IMPORT
+#define LIBHDFS_EXTERNAL __declspec(dllimport)
+#else
+#define LIBHDFS_EXTERNAL
+#endif
+#else
+#ifdef LIBHDFS_DLL_EXPORT
+#define LIBHDFS_EXTERNAL __attribute__((visibility("default")))
+#elif LIBHDFS_DLL_IMPORT
+#define LIBHDFS_EXTERNAL __attribute__((visibility("default")))
+#else
+#define LIBHDFS_EXTERNAL
+#endif
+#endif
+
+#ifndef O_RDONLY
+#define O_RDONLY 1
+#endif
+
+#ifndef O_WRONLY
+#define O_WRONLY 2
+#endif
+
+#ifndef EINTERNAL
+#define EINTERNAL 255
+#endif
+
+#define ELASTIC_BYTE_BUFFER_POOL_CLASS \
+  "org/apache/hadoop/io/ElasticByteBufferPool"
+
+/** All APIs set errno to meaningful values */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/**
+ * Some utility decls used in libhdfs.
+ */
+struct hdfsBuilder;
+typedef int32_t tSize;    /// size of data for read/write io ops
+typedef time_t tTime;     /// time type in seconds
+typedef int64_t tOffset;  /// offset within the file
+typedef uint16_t tPort;   /// port
+typedef enum tObjectKind {
+  kObjectKindFile = 'F',
+  kObjectKindDirectory = 'D',
+} tObjectKind;
+
+/**
+ * The C reflection of org.apache.hadoop.fs.FileSystem .
+ */
+struct hdfs_internal;
+typedef struct hdfs_internal *hdfsFS;
+
+struct hdfsFile_internal;
+typedef struct hdfsFile_internal *hdfsFile;
+
+struct hadoopRzOptions;
+
+struct hadoopRzBuffer;
+
+/**
+ * Determine if a file is open for read.
+ *
+ * @param file     The HDFS file
+ * @return         1 if the file is open for read; 0 otherwise
+ */
+LIBHDFS_EXTERNAL
+int hdfsFileIsOpenForRead(hdfsFile file);
+
+/**
+ * Determine if a file is open for write.
+ *
+ * @param file     The HDFS file
+ * @return         1 if the file is open for write; 0 otherwise
+ */
+LIBHDFS_EXTERNAL
+int hdfsFileIsOpenForWrite(hdfsFile file);
+
+struct hdfsReadStatistics {
+  uint64_t totalBytesRead;
+  uint64_t totalLocalBytesRead;
+  uint64_t totalShortCircuitBytesRead;
+  uint64_t totalZeroCopyBytesRead;
+};
+
+/**
+ * Get read statistics about a file.  This is only applicable to files
+ * opened for reading.
+ *
+ * @param file     The HDFS file
+ * @param stats    (out parameter) on a successful return, the read
+ *                 statistics.  Unchanged otherwise.  You must free the
+ *                 returned statistics with hdfsFileFreeReadStatistics.
+ * @return         0 if the statistics were successfully returned,
+ *                 -1 otherwise.  On a failure, please check errno against
+ *                 ENOTSUP.  webhdfs, LocalFilesystem, and so forth may
+ *                 not support read statistics.
+ */
+LIBHDFS_EXTERNAL
+int hdfsFileGetReadStatistics(hdfsFile file, struct hdfsReadStatistics **stats);
+
+/**
+ * @param stats    HDFS read statistics for a file.
+ *
+ * @return the number of remote bytes read.
+ */
+LIBHDFS_EXTERNAL
+int64_t hdfsReadStatisticsGetRemoteBytesRead(
+    const struct hdfsReadStatistics *stats);
+
+/**
+ * Clear the read statistics for a file.
+ *
+ * @param file      The file to clear the read statistics of.
+ *
+ * @return          0 on success; the error code otherwise.
+ *                  EINVAL: the file is not open for reading.
+ *                  ENOTSUP: the file does not support clearing the read
+ *                  statistics.
+ *                  Errno will also be set to this code on failure.
+ */
+LIBHDFS_EXTERNAL
+int hdfsFileClearReadStatistics(hdfsFile file);
+
+/**
+ * Free some HDFS read statistics.
+ *
+ * @param stats    The HDFS read statistics to free.
+ */
+LIBHDFS_EXTERNAL
+void hdfsFileFreeReadStatistics(struct hdfsReadStatistics *stats);
+
+/**
+ * hdfsConnectAsUser - Connect to a hdfs file system as a specific user
+ * Connect to the hdfs.
+ * @param nn   The NameNode.  See hdfsBuilderSetNameNode for details.
+ * @param port The port on which the server is listening.
+ * @param user the user name (this is hadoop domain user). Or NULL is equivalent
+ * to hdfsConnect(host, port)
+ * @return Returns a handle to the filesystem or NULL on error.
+ * @deprecated Use hdfsBuilderConnect instead.
+ */
+LIBHDFS_EXTERNAL
+hdfsFS hdfsConnectAsUser(const char *nn, tPort port, const char *user);
+
+/**
+ * hdfsConnect - Connect to a hdfs file system.
+ * Connect to the hdfs.
+ * @param nn   The NameNode.  See hdfsBuilderSetNameNode for details.
+ * @param port The port on which the server is listening.
+ * @return Returns a handle to the filesystem or NULL on error.
+ * @deprecated Use hdfsBuilderConnect instead.
+ */
+LIBHDFS_EXTERNAL
+hdfsFS hdfsConnect(const char *nn, tPort port);
+
+/**
+ * hdfsConnect - Connect to an hdfs file system.
+ *
+ * Forces a new instance to be created
+ *
+ * @param nn     The NameNode.  See hdfsBuilderSetNameNode for details.
+ * @param port   The port on which the server is listening.
+ * @param user   The user name to use when connecting
+ * @return       Returns a handle to the filesystem or NULL on error.
+ * @deprecated   Use hdfsBuilderConnect instead.
+ */
+LIBHDFS_EXTERNAL
+hdfsFS hdfsConnectAsUserNewInstance(const char *nn, tPort port,
+                                    const char *user);
+
+/**
+ * hdfsConnect - Connect to an hdfs file system.
+ *
+ * Forces a new instance to be created
+ *
+ * @param nn     The NameNode.  See hdfsBuilderSetNameNode for details.
+ * @param port   The port on which the server is listening.
+ * @return       Returns a handle to the filesystem or NULL on error.
+ * @deprecated   Use hdfsBuilderConnect instead.
+ */
+LIBHDFS_EXTERNAL
+hdfsFS hdfsConnectNewInstance(const char *nn, tPort port);
+
+/**
+ * Connect to HDFS using the parameters defined by the builder.
+ *
+ * The HDFS builder will be freed, whether or not the connection was
+ * successful.
+ *
+ * Every successful call to hdfsBuilderConnect should be matched with a call
+ * to hdfsDisconnect, when the hdfsFS is no longer needed.
+ *
+ * @param bld    The HDFS builder
+ * @return       Returns a handle to the filesystem, or NULL on error.
+ */
+LIBHDFS_EXTERNAL
+hdfsFS hdfsBuilderConnect(struct hdfsBuilder *bld);
+
+/**
+ * Create an HDFS builder.
+ *
+ * @return The HDFS builder, or NULL on error.
+ */
+LIBHDFS_EXTERNAL
+struct hdfsBuilder *hdfsNewBuilder(void);
+
+/**
+ * Force the builder to always create a new instance of the FileSystem,
+ * rather than possibly finding one in the cache.
+ *
+ * @param bld The HDFS builder
+ */
+LIBHDFS_EXTERNAL
+void hdfsBuilderSetForceNewInstance(struct hdfsBuilder *bld);
+
+/**
+ * Set the HDFS NameNode to connect to.
+ *
+ * @param bld  The HDFS builder
+ * @param nn   The NameNode to use.
+ *
+ *             If the string given is 'default', the default NameNode
+ *             configuration will be used (from the XML configuration files)
+ *
+ *             If NULL is given, a LocalFileSystem will be created.
+ *
+ *             If the string starts with a protocol type such as file:// or
+ *             hdfs://, this protocol type will be used.  If not, the
+ *             hdfs:// protocol type will be used.
+ *
+ *             You may specify a NameNode port in the usual way by
+ *             passing a string of the format hdfs://<hostname>:<port>.
+ *             Alternately, you may set the port with
+ *             hdfsBuilderSetNameNodePort.  However, you must not pass the
+ *             port in two different ways.
+ */
+LIBHDFS_EXTERNAL
+void hdfsBuilderSetNameNode(struct hdfsBuilder *bld, const char *nn);
+
+/**
+ * Set the port of the HDFS NameNode to connect to.
+ *
+ * @param bld The HDFS builder
+ * @param port The port.
+ */
+LIBHDFS_EXTERNAL
+void hdfsBuilderSetNameNodePort(struct hdfsBuilder *bld, tPort port);
+
+/**
+ * Set the username to use when connecting to the HDFS cluster.
+ *
+ * @param bld The HDFS builder
+ * @param userName The user name.  The string will be shallow-copied.
+ */
+LIBHDFS_EXTERNAL
+void hdfsBuilderSetUserName(struct hdfsBuilder *bld, const char *userName);
+
+/**
+ * Set the path to the Kerberos ticket cache to use when connecting to
+ * the HDFS cluster.
+ *
+ * @param bld The HDFS builder
+ * @param kerbTicketCachePath The Kerberos ticket cache path.  The string
+ *                            will be shallow-copied.
+ */
+LIBHDFS_EXTERNAL
+void hdfsBuilderSetKerbTicketCachePath(struct hdfsBuilder *bld,
+                                       const char *kerbTicketCachePath);
+
+/**
+ * Free an HDFS builder.
+ *
+ * It is normally not necessary to call this function since
+ * hdfsBuilderConnect frees the builder.
+ *
+ * @param bld The HDFS builder
+ */
+LIBHDFS_EXTERNAL
+void hdfsFreeBuilder(struct hdfsBuilder *bld);
+
+/**
+ * Set a configuration string for an HdfsBuilder.
+ *
+ * @param key      The key to set.
+ * @param val      The value, or NULL to set no value.
+ *                 This will be shallow-copied.  You are responsible for
+ *                 ensuring that it remains valid until the builder is
+ *                 freed.
+ *
+ * @return         0 on success; nonzero error code otherwise.
+ */
+LIBHDFS_EXTERNAL
+int hdfsBuilderConfSetStr(struct hdfsBuilder *bld, const char *key,
+                          const char *val);
+
+/**
+ * Get a configuration string.
+ *
+ * @param key      The key to find
+ * @param val      (out param) The value.  This will be set to NULL if the
+ *                 key isn't found.  You must free this string with
+ *                 hdfsConfStrFree.
+ *
+ * @return         0 on success; nonzero error code otherwise.
+ *                 Failure to find the key is not an error.
+ */
+LIBHDFS_EXTERNAL
+int hdfsConfGetStr(const char *key, char **val);
+
+/**
+ * Get a configuration integer.
+ *
+ * @param key      The key to find
+ * @param val      (out param) The value.  This will NOT be changed if the
+ *                 key isn't found.
+ *
+ * @return         0 on success; nonzero error code otherwise.
+ *                 Failure to find the key is not an error.
+ */
+LIBHDFS_EXTERNAL
+int hdfsConfGetInt(const char *key, int32_t *val);
+
+/**
+ * Free a configuration string found with hdfsConfGetStr.
+ *
+ * @param val      A configuration string obtained from hdfsConfGetStr
+ */
+LIBHDFS_EXTERNAL
+void hdfsConfStrFree(char *val);
+
+/**
+ * hdfsDisconnect - Disconnect from the hdfs file system.
+ * Disconnect from hdfs.
+ * @param fs The configured filesystem handle.
+ * @return Returns 0 on success, -1 on error.
+ *         Even if there is an error, the resources associated with the
+ *         hdfsFS will be freed.
+ */
+LIBHDFS_EXTERNAL
+int hdfsDisconnect(hdfsFS fs);
+
+/**
+ * hdfsOpenFile - Open a hdfs file in given mode.
+ * @param fs The configured filesystem handle.
+ * @param path The full path to the file.
+ * @param flags - an | of bits/fcntl.h file flags - supported flags are
+ * O_RDONLY, O_WRONLY (meaning create or overwrite i.e., implies O_TRUNC),
+ * O_WRONLY|O_APPEND. Other flags are generally ignored other than (O_RDWR ||
+ * (O_EXCL & O_CREAT)) which return NULL and set errno equal ENOTSUP.
+ * @param bufferSize Size of buffer for read/write - pass 0 if you want
+ * to use the default configured values.
+ * @param replication Block replication - pass 0 if you want to use
+ * the default configured values.
+ * @param blocksize Size of block - pass 0 if you want to use the
+ * default configured values.
+ * @return Returns the handle to the open file or NULL on error.
+ */
+LIBHDFS_EXTERNAL
+hdfsFile hdfsOpenFile(hdfsFS fs, const char *path, int flags, int bufferSize,
+                      short replication, tSize blocksize);
+
+/**
+ * hdfsTruncateFile - Truncate a hdfs file to given length.
+ * @param fs The configured filesystem handle.
+ * @param path The full path to the file.
+ * @param newlength The size the file is to be truncated to
+ * @return 1 if the file has been truncated to the desired newlength
+ *         and is immediately available to be reused for write operations
+ *         such as append.
+ *         0 if a background process of adjusting the length of the last
+ *         block has been started, and clients should wait for it to
+ *         complete before proceeding with further file updates.
+ *         -1 on error.
+ */
+int hdfsTruncateFile(hdfsFS fs, const char *path, tOffset newlength);
+
+/**
+ * hdfsUnbufferFile - Reduce the buffering done on a file.
+ *
+ * @param file  The file to unbuffer.
+ * @return      0 on success
+ *              ENOTSUP if the file does not support unbuffering
+ *              Errno will also be set to this value.
+ */
+LIBHDFS_EXTERNAL
+int hdfsUnbufferFile(hdfsFile file);
+
+/**
+ * hdfsCloseFile - Close an open file.
+ * @param fs The configured filesystem handle.
+ * @param file The file handle.
+ * @return Returns 0 on success, -1 on error.
+ *         On error, errno will be set appropriately.
+ *         If the hdfs file was valid, the memory associated with it will
+ *         be freed at the end of this call, even if there was an I/O
+ *         error.
+ */
+LIBHDFS_EXTERNAL
+int hdfsCloseFile(hdfsFS fs, hdfsFile file);
+
+/**
+ * hdfsExists - Checks if a given path exists on the filesystem
+ * @param fs The configured filesystem handle.
+ * @param path The path to look for
+ * @return Returns 0 on success, -1 on error.
+ */
+LIBHDFS_EXTERNAL
+int hdfsExists(hdfsFS fs, const char *path);
+
+/**
+ * hdfsSeek - Seek to given offset in file.
+ * This works only for files opened in read-only mode.
+ * @param fs The configured filesystem handle.
+ * @param file The file handle.
+ * @param desiredPos Offset into the file to seek into.
+ * @return Returns 0 on success, -1 on error.
+ */
+LIBHDFS_EXTERNAL
+int hdfsSeek(hdfsFS fs, hdfsFile file, tOffset desiredPos);
+
+/**
+ * hdfsTell - Get the current offset in the file, in bytes.
+ * @param fs The configured filesystem handle.
+ * @param file The file handle.
+ * @return Current offset, -1 on error.
+ */
+LIBHDFS_EXTERNAL
+tOffset hdfsTell(hdfsFS fs, hdfsFile file);
+
+/**
+ * hdfsRead - Read data from an open file.
+ * @param fs The configured filesystem handle.
+ * @param file The file handle.
+ * @param buffer The buffer to copy read bytes into.
+ * @param length The length of the buffer.
+ * @return      On success, a positive number indicating how many bytes
+ *              were read.
+ *              On end-of-file, 0.
+ *              On error, -1.  Errno will be set to the error code.
+ *              Just like the POSIX read function, hdfsRead will return -1
+ *              and set errno to EINTR if data is temporarily unavailable,
+ *              but we are not yet at the end of the file.
+ */
+LIBHDFS_EXTERNAL
+tSize hdfsRead(hdfsFS fs, hdfsFile file, void *buffer, tSize length);
+
+/**
+ * hdfsPread - Positional read of data from an open file.
+ * @param fs The configured filesystem handle.
+ * @param file The file handle.
+ * @param position Position from which to read
+ * @param buffer The buffer to copy read bytes into.
+ * @param length The length of the buffer.
+ * @return      See hdfsRead
+ */
+LIBHDFS_EXTERNAL
+tSize hdfsPread(hdfsFS fs, hdfsFile file, tOffset position, void *buffer,
+                tSize length);
+
+/**
+ * hdfsWrite - Write data into an open file.
+ * @param fs The configured filesystem handle.
+ * @param file The file handle.
+ * @param buffer The data.
+ * @param length The no. of bytes to write.
+ * @return Returns the number of bytes written, -1 on error.
+ */
+LIBHDFS_EXTERNAL
+tSize hdfsWrite(hdfsFS fs, hdfsFile file, const void *buffer, tSize length);
+
+/**
+ * hdfsFlush - Flush the data.
+ * @param fs The configured filesystem handle.
+ * @param file The file handle.
+ * @return Returns 0 on success, -1 on error.
+ */
+LIBHDFS_EXTERNAL
+int hdfsFlush(hdfsFS fs, hdfsFile file);
+
+/**
+ * hdfsHFlush - Flush out the data in client's user buffer. After the
+ * return of this call, new readers will see the data.
+ * @param fs configured filesystem handle
+ * @param file file handle
+ * @return 0 on success, -1 on error and sets errno
+ */
+LIBHDFS_EXTERNAL
+int hdfsHFlush(hdfsFS fs, hdfsFile file);
+
+/**
+ * hdfsHSync - Similar to posix fsync, Flush out the data in client's
+ * user buffer all the way to the disk device (but the disk may have
+ * it in its cache).
+ * @param fs configured filesystem handle
+ * @param file file handle
+ * @return 0 on success, -1 on error and sets errno
+ */
+LIBHDFS_EXTERNAL
+int hdfsHSync(hdfsFS fs, hdfsFile file);
+
+/**
+ * hdfsAvailable - Number of bytes that can be read from this
+ * input stream without blocking.
+ * @param fs The configured filesystem handle.
+ * @param file The file handle.
+ * @return Returns available bytes; -1 on error.
+ */
+LIBHDFS_EXTERNAL
+int hdfsAvailable(hdfsFS fs, hdfsFile file);
+
+/**
+ * hdfsCopy - Copy file from one filesystem to another.
+ * @param srcFS The handle to source filesystem.
+ * @param src The path of source file.
+ * @param dstFS The handle to destination filesystem.
+ * @param dst The path of destination file.
+ * @return Returns 0 on success, -1 on error.
+ */
+LIBHDFS_EXTERNAL
+int hdfsCopy(hdfsFS srcFS, const char *src, hdfsFS dstFS, const char *dst);
+
+/**
+ * hdfsMove - Move file from one filesystem to another.
+ * @param srcFS The handle to source filesystem.
+ * @param src The path of source file.
+ * @param dstFS The handle to destination filesystem.
+ * @param dst The path of destination file.
+ * @return Returns 0 on success, -1 on error.
+ */
+LIBHDFS_EXTERNAL
+int hdfsMove(hdfsFS srcFS, const char *src, hdfsFS dstFS, const char *dst);
+
+/**
+ * hdfsDelete - Delete file.
+ * @param fs The configured filesystem handle.
+ * @param path The path of the file.
+ * @param recursive if path is a directory and set to
+ * non-zero, the directory is deleted else throws an exception. In
+ * case of a file the recursive argument is irrelevant.
+ * @return Returns 0 on success, -1 on error.
+ */
+LIBHDFS_EXTERNAL
+int hdfsDelete(hdfsFS fs, const char *path, int recursive);
+
+/**
+ * hdfsRename - Rename file.
+ * @param fs The configured filesystem handle.
+ * @param oldPath The path of the source file.
+ * @param newPath The path of the destination file.
+ * @return Returns 0 on success, -1 on error.
+ */
+LIBHDFS_EXTERNAL
+int hdfsRename(hdfsFS fs, const char *oldPath, const char *newPath);
+
+/**
+ * hdfsGetWorkingDirectory - Get the current working directory for
+ * the given filesystem.
+ * @param fs The configured filesystem handle.
+ * @param buffer The user-buffer to copy path of cwd into.
+ * @param bufferSize The length of user-buffer.
+ * @return Returns buffer, NULL on error.
+ */
+LIBHDFS_EXTERNAL
+char *hdfsGetWorkingDirectory(hdfsFS fs, char *buffer, size_t bufferSize);
+
+/**
+ * hdfsSetWorkingDirectory - Set the working directory. All relative
+ * paths will be resolved relative to it.
+ * @param fs The configured filesystem handle.
+ * @param path The path of the new 'cwd'.
+ * @return Returns 0 on success, -1 on error.
+ */
+LIBHDFS_EXTERNAL
+int hdfsSetWorkingDirectory(hdfsFS fs, const char *path);
+
+/**
+ * hdfsCreateDirectory - Make the given file and all non-existent
+ * parents into directories.
+ * @param fs The configured filesystem handle.
+ * @param path The path of the directory.
+ * @return Returns 0 on success, -1 on error.
+ */
+LIBHDFS_EXTERNAL
+int hdfsCreateDirectory(hdfsFS fs, const char *path);
+
+/**
+ * hdfsSetReplication - Set the replication of the specified
+ * file to the supplied value
+ * @param fs The configured filesystem handle.
+ * @param path The path of the file.
+ * @return Returns 0 on success, -1 on error.
+ */
+LIBHDFS_EXTERNAL
+int hdfsSetReplication(hdfsFS fs, const char *path, int16_t replication);
+
+/**
+ * hdfsFileInfo - Information about a file/directory.
+ */
+typedef struct {
+  tObjectKind mKind;  /* file or directory */
+  char *mName;        /* the name of the file */
+  tTime mLastMod;     /* the last modification time for the file in seconds */
+  tOffset mSize;      /* the size of the file in bytes */
+  short mReplication; /* the count of replicas */
+  tOffset mBlockSize; /* the block size for the file */
+  char *mOwner;       /* the owner of the file */
+  char *mGroup;       /* the group associated with the file */
+  short mPermissions; /* the permissions associated with the file */
+  tTime mLastAccess;  /* the last access time for the file in seconds */
+} hdfsFileInfo;
+
+/**
+ * hdfsListDirectory - Get list of files/directories for a given
+ * directory-path. hdfsFreeFileInfo should be called to deallocate memory.
+ * @param fs The configured filesystem handle.
+ * @param path The path of the directory.
+ * @param numEntries Set to the number of files/directories in path.
+ * @return Returns a dynamically-allocated array of hdfsFileInfo
+ * objects; NULL on error.
+ */
+LIBHDFS_EXTERNAL
+hdfsFileInfo *hdfsListDirectory(hdfsFS fs, const char *path, int *numEntries);
+
+/**
+ * hdfsGetPathInfo - Get information about a path as a (dynamically
+ * allocated) single hdfsFileInfo struct. hdfsFreeFileInfo should be
+ * called when the pointer is no longer needed.
+ * @param fs The configured filesystem handle.
+ * @param path The path of the file.
+ * @return Returns a dynamically-allocated hdfsFileInfo object;
+ * NULL on error.
+ */
+LIBHDFS_EXTERNAL
+hdfsFileInfo *hdfsGetPathInfo(hdfsFS fs, const char *path);
+
+/**
+ * hdfsFreeFileInfo - Free up the hdfsFileInfo array (including fields)
+ * @param hdfsFileInfo The array of dynamically-allocated hdfsFileInfo
+ * objects.
+ * @param numEntries The size of the array.
+ */
+LIBHDFS_EXTERNAL
+void hdfsFreeFileInfo(hdfsFileInfo *hdfsFileInfo, int numEntries);
+
+/**
+ * hdfsFileIsEncrypted: determine if a file is encrypted based on its
+ * hdfsFileInfo.
+ * @return -1 if there was an error (errno will be set), 0 if the file is
+ *         not encrypted, 1 if the file is encrypted.
+ */
+LIBHDFS_EXTERNAL
+int hdfsFileIsEncrypted(hdfsFileInfo *hdfsFileInfo);
+
+/**
+ * hdfsGetHosts - Get hostnames where a particular block (determined by
+ * pos & blocksize) of a file is stored. The last element in the array
+ * is NULL. Due to replication, a single block could be present on
+ * multiple hosts.
+ * @param fs The configured filesystem handle.
+ * @param path The path of the file.
+ * @param start The start of the block.
+ * @param length The length of the block.
+ * @return Returns a dynamically-allocated 2-d array of blocks-hosts;
+ * NULL on error.
+ */
+LIBHDFS_EXTERNAL
+char ***hdfsGetHosts(hdfsFS fs, const char *path, tOffset start,
+                     tOffset length);
+
+/**
+ * hdfsFreeHosts - Free up the structure returned by hdfsGetHosts
+ * @param blockHosts The dynamically-allocated 2-d array of
+ * blocks-hosts returned by hdfsGetHosts; the last element in
+ * the array is NULL.
+ */
+LIBHDFS_EXTERNAL
+void hdfsFreeHosts(char ***blockHosts);
+
+/**
+ * hdfsGetDefaultBlockSize - Get the default blocksize.
+ *
+ * @param fs            The configured filesystem handle.
+ * @deprecated          Use hdfsGetDefaultBlockSizeAtPath instead.
+ *
+ * @return              Returns the default blocksize, or -1 on error.
+ */
+LIBHDFS_EXTERNAL
+tOffset hdfsGetDefaultBlockSize(hdfsFS fs);
+
+/**
+ * hdfsGetDefaultBlockSizeAtPath - Get the default blocksize at the
+ * filesystem indicated by a given path.
+ *
+ * @param fs            The configured filesystem handle.
+ * @param path          The given path will be used to locate the actual
+ *                      filesystem.  The full path does not have to exist.
+ *
+ * @return              Returns the default blocksize, or -1 on error.
+ */
+LIBHDFS_EXTERNAL
+tOffset hdfsGetDefaultBlockSizeAtPath(hdfsFS fs, const char *path);
+
+/**
+ * hdfsGetCapacity - Return the raw capacity of the filesystem.
+ * @param fs The configured filesystem handle.
+ * @return Returns the raw-capacity; -1 on error.
+ */
+LIBHDFS_EXTERNAL
+tOffset hdfsGetCapacity(hdfsFS fs);
+
+/**
+ * hdfsGetUsed - Return the total raw size of all files in the filesystem.
+ * @param fs The configured filesystem handle.
+ * @return Returns the total-size; -1 on error.
+ */
+LIBHDFS_EXTERNAL
+tOffset hdfsGetUsed(hdfsFS fs);
+
+/**
+ * Change the user and/or group of a file or directory.
+ *
+ * @param fs            The configured filesystem handle.
+ * @param path          the path to the file or directory
+ * @param owner         User string.  Set to NULL for 'no change'
+ * @param group         Group string.  Set to NULL for 'no change'
+ * @return              0 on success else -1
+ */
+LIBHDFS_EXTERNAL
+int hdfsChown(hdfsFS fs, const char *path, const char *owner,
+              const char *group);
+
+/**
+ * hdfsChmod
+ * @param fs The configured filesystem handle.
+ * @param path the path to the file or directory
+ * @param mode the bitmask to set it to
+ * @return 0 on success else -1
+ */
+LIBHDFS_EXTERNAL
+int hdfsChmod(hdfsFS fs, const char *path, short mode);
+
+/**
+ * hdfsUtime
+ * @param fs The configured filesystem handle.
+ * @param path the path to the file or directory
+ * @param mtime new modification time or -1 for no change
+ * @param atime new access time or -1 for no change
+ * @return 0 on success else -1
+ */
+LIBHDFS_EXTERNAL
+int hdfsUtime(hdfsFS fs, const char *path, tTime mtime, tTime atime);
+
+/**
+ * Allocate a zero-copy options structure.
+ *
+ * You must free all options structures allocated with this function using
+ * hadoopRzOptionsFree.
+ *
+ * @return            A zero-copy options structure, or NULL if one could
+ *                    not be allocated.  If NULL is returned, errno will
+ *                    contain the error number.
+ */
+LIBHDFS_EXTERNAL
+struct hadoopRzOptions *hadoopRzOptionsAlloc(void);
+
+/**
+ * Determine whether we should skip checksums in read0.
+ *
+ * @param opts        The options structure.
+ * @param skip        Nonzero to skip checksums sometimes; zero to always
+ *                    check them.
+ *
+ * @return            0 on success; -1 plus errno on failure.
+ */
+LIBHDFS_EXTERNAL
+int hadoopRzOptionsSetSkipChecksum(struct hadoopRzOptions *opts, int skip);
+
+/**
+ * Set the ByteBufferPool to use with read0.
+ *
+ * @param opts        The options structure.
+ * @param className   If this is NULL, we will not use any
+ *                    ByteBufferPool.  If this is non-NULL, it will be
+ *                    treated as the name of the pool class to use.
+ *                    For example, you can use
+ *                    ELASTIC_BYTE_BUFFER_POOL_CLASS.
+ *
+ * @return            0 if the ByteBufferPool class was found and
+ *                    instantiated;
+ *                    -1 plus errno otherwise.
+ */
+LIBHDFS_EXTERNAL
+int hadoopRzOptionsSetByteBufferPool(struct hadoopRzOptions *opts,
+                                     const char *className);
+
+/**
+ * Free a hadoopRzOptions structure.
+ *
+ * @param opts        The options structure to free.
+ *                    Any associated ByteBufferPool will also be freed.
+ */
+LIBHDFS_EXTERNAL
+void hadoopRzOptionsFree(struct hadoopRzOptions *opts);
+
+/**
+ * Perform a byte buffer read.
+ * If possible, this will be a zero-copy (mmap) read.
+ *
+ * @param file       The file to read from.
+ * @param opts       An options structure created by hadoopRzOptionsAlloc.
+ * @param maxLength  The maximum length to read.  We may read fewer bytes
+ *                   than this length.
+ *
+ * @return           On success, we will return a new hadoopRzBuffer.
+ *                   This buffer will continue to be valid and readable
+ *                   until it is released by hadoopRzBufferFree.  Failure to
+ *                   release a buffer will lead to a memory leak.
+ *                   You can access the data within the hadoopRzBuffer with
+ *                   hadoopRzBufferGet.  If you have reached EOF, the data
+ *                   within the hadoopRzBuffer will be NULL.  You must still
+ *                   free hadoopRzBuffer instances containing NULL.
+ *
+ *                   On failure, we will return NULL plus an errno code.
+ *                   errno = EOPNOTSUPP indicates that we could not do a
+ *                   zero-copy read, and there was no ByteBufferPool
+ *                   supplied.
+ */
+LIBHDFS_EXTERNAL
+struct hadoopRzBuffer *hadoopReadZero(hdfsFile file,
+                                      struct hadoopRzOptions *opts,
+                                      int32_t maxLength);
+
+/**
+ * Determine the length of the buffer returned from readZero.
+ *
+ * @param buffer     a buffer returned from readZero.
+ * @return           the length of the buffer.
+ */
+LIBHDFS_EXTERNAL
+int32_t hadoopRzBufferLength(const struct hadoopRzBuffer *buffer);
+
+/**
+ * Get a pointer to the raw buffer returned from readZero.
+ *
+ * To find out how many bytes this buffer contains, call
+ * hadoopRzBufferLength.
+ *
+ * @param buffer     a buffer returned from readZero.
+ * @return           a pointer to the start of the buffer.  This will be
+ *                   NULL when end-of-file has been reached.
+ */
+LIBHDFS_EXTERNAL
+const void *hadoopRzBufferGet(const struct hadoopRzBuffer *buffer);
+
+/**
+ * Release a buffer obtained through readZero.
+ *
+ * @param file       The hdfs stream that created this buffer.  This must be
+ *                   the same stream you called hadoopReadZero on.
+ * @param buffer     The buffer to release.
+ */
+LIBHDFS_EXTERNAL
+void hadoopRzBufferFree(hdfsFile file, struct hadoopRzBuffer *buffer);
+
+#ifdef __cplusplus
+}
+#endif
+
+#undef LIBHDFS_EXTERNAL
+#endif /*LIBHDFS_HDFS_H*/
+
+/**
+ * vim: ts=4: sw=4: et
+ */
diff --git a/zlib.BUILD b/zlib.BUILD
index 9e0ce538788..edb77fdf8ee 100644
--- a/zlib.BUILD
+++ b/zlib.BUILD
@@ -2,11 +2,35 @@ package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])  # BSD/MIT-like license (for zlib)
 
-prefix_dir = "zlib-1.2.8"
-
 cc_library(
     name = "zlib",
-    srcs = glob([prefix_dir + "/*.c"]),
-    hdrs = glob([prefix_dir + "/*.h"]),
-    includes = [prefix_dir],
+    srcs = [
+        "adler32.c",
+        "compress.c",
+        "crc32.c",
+        "crc32.h",
+        "deflate.c",
+        "deflate.h",
+        "gzclose.c",
+        "gzguts.h",
+        "gzlib.c",
+        "gzread.c",
+        "gzwrite.c",
+        "infback.c",
+        "inffast.c",
+        "inffast.h",
+        "inffixed.h",
+        "inflate.c",
+        "inflate.h",
+        "inftrees.c",
+        "inftrees.h",
+        "trees.c",
+        "trees.h",
+        "uncompr.c",
+        "zconf.h",
+        "zutil.c",
+        "zutil.h",
+    ],
+    hdrs = ["zlib.h"],
+    includes = ["."],
 )